Repository: huggingface/accelerate
Branch: main
Commit: 1622df332f4a
Files: 349
Total size: 3.1 MB

Directory structure:
gitextract_vek8qtxm/

├── .devcontainer/
│   └── devcontainer.json
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   └── bug-report.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows/
│       ├── build-docker-images-release.yml
│       ├── build_and_run_tests.yml
│       ├── build_docker_images.yml
│       ├── build_documentation.yml
│       ├── build_pr_documentation.yml
│       ├── fp8_runner.yml
│       ├── gaudi3_scheduled.yml
│       ├── integration_tests.yml
│       ├── nightly.yml
│       ├── pr_style_bot.yml
│       ├── quality.yml
│       ├── run_merge_tests.yml
│       ├── self_hosted_integration_tests.yml
│       ├── stale.yml
│       ├── test.yml
│       ├── test_imports.yml
│       ├── trufflehog.yml
│       └── upload_pr_documentation.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── benchmarks/
│   ├── README.md
│   ├── big_model_inference/
│   │   ├── README.md
│   │   ├── big_model_inference.py
│   │   └── measures_util.py
│   ├── fp8/
│   │   ├── ms_amp/
│   │   │   ├── Dockerfile
│   │   │   ├── ddp.py
│   │   │   ├── distrib_deepspeed.py
│   │   │   ├── fp8_utils.py
│   │   │   └── non_distributed.py
│   │   ├── torchao/
│   │   │   ├── Dockerfile
│   │   │   ├── README.md
│   │   │   ├── ddp.py
│   │   │   ├── distrib_deepspeed.py
│   │   │   ├── fp8_utils.py
│   │   │   ├── fsdp.py
│   │   │   └── non_distributed.py
│   │   └── transformer_engine/
│   │       ├── Dockerfile
│   │       ├── README.md
│   │       ├── ddp.py
│   │       ├── distrib_deepspeed.py
│   │       ├── fp8_utils.py
│   │       ├── fsdp.py
│   │       └── non_distributed.py
│   ├── fsdp2/
│   │   ├── README.md
│   │   ├── main.py
│   │   ├── measure_utils.py
│   │   ├── utils.py
│   │   └── visualize.py
│   └── torch.compile/
│       ├── README.md
│       └── regional_compilation.py
├── docker/
│   ├── README.md
│   ├── accelerate-cpu/
│   │   └── Dockerfile
│   ├── accelerate-gpu/
│   │   └── Dockerfile
│   └── accelerate-gpu-deepspeed/
│       └── Dockerfile
├── docs/
│   ├── Makefile
│   ├── README.md
│   └── source/
│       ├── _toctree.yml
│       ├── basic_tutorials/
│       │   ├── execution.md
│       │   ├── install.md
│       │   ├── launch.md
│       │   ├── migration.md
│       │   ├── notebook.md
│       │   ├── overview.md
│       │   ├── tpu.md
│       │   └── troubleshooting.md
│       ├── concept_guides/
│       │   ├── big_model_inference.md
│       │   ├── context_parallelism.md
│       │   ├── deferring_execution.md
│       │   ├── fsdp1_vs_fsdp2.md
│       │   ├── fsdp_and_deepspeed.md
│       │   ├── gradient_synchronization.md
│       │   ├── internal_mechanism.md
│       │   ├── low_precision_training.md
│       │   ├── performance.md
│       │   ├── sequence_parallelism.md
│       │   └── training_tpu.md
│       ├── index.md
│       ├── package_reference/
│       │   ├── accelerator.md
│       │   ├── big_modeling.md
│       │   ├── cli.md
│       │   ├── deepspeed.md
│       │   ├── fp8.md
│       │   ├── fsdp.md
│       │   ├── inference.md
│       │   ├── kwargs.md
│       │   ├── launchers.md
│       │   ├── logging.md
│       │   ├── megatron_lm.md
│       │   ├── state.md
│       │   ├── torch_wrappers.md
│       │   ├── tracking.md
│       │   └── utilities.md
│       ├── quicktour.md
│       └── usage_guides/
│           ├── big_modeling.md
│           ├── checkpoint.md
│           ├── compilation.md
│           ├── ddp_comm_hook.md
│           ├── deepspeed.md
│           ├── deepspeed_multiple_model.md
│           ├── distributed_inference.md
│           ├── explore.md
│           ├── fsdp.md
│           ├── gaudi.md
│           ├── gradient_accumulation.md
│           ├── intel_cpu.md
│           ├── local_sgd.md
│           ├── low_precision_training.md
│           ├── megatron_lm.md
│           ├── model_size_estimator.md
│           ├── mps.md
│           ├── profiler.md
│           ├── quantization.md
│           ├── sagemaker.md
│           ├── tracking.md
│           └── training_zoo.md
├── examples/
│   ├── README.md
│   ├── alst_ulysses_sequence_parallelism/
│   │   ├── README.md
│   │   ├── sp-alst.accelerate-config.yml
│   │   ├── sp-alst.ds-config.json
│   │   ├── sp-alst.py
│   │   └── sp-alst.sh
│   ├── by_feature/
│   │   ├── README.md
│   │   ├── automatic_gradient_accumulation.py
│   │   ├── checkpointing.py
│   │   ├── cross_validation.py
│   │   ├── ddp_comm_hook.py
│   │   ├── deepspeed_with_config_support.py
│   │   ├── early_stopping.py
│   │   ├── fsdp_with_peak_mem_tracking.py
│   │   ├── gradient_accumulation.py
│   │   ├── gradient_accumulation_for_autoregressive_models.py
│   │   ├── local_sgd.py
│   │   ├── megatron_lm_gpt_pretraining.py
│   │   ├── memory.py
│   │   ├── multi_process_metrics.py
│   │   ├── profiler.py
│   │   ├── schedule_free.py
│   │   └── tracking.py
│   ├── complete_cv_example.py
│   ├── complete_nlp_example.py
│   ├── config_yaml_templates/
│   │   ├── README.md
│   │   ├── deepspeed.yaml
│   │   ├── fp8.yaml
│   │   ├── fsdp.yaml
│   │   ├── multi_gpu.yaml
│   │   ├── multi_node.yaml
│   │   ├── multi_xpu.yaml
│   │   ├── run_me.py
│   │   └── single_accelerator.yaml
│   ├── cv_example.py
│   ├── deepspeed_config_templates/
│   │   ├── zero_stage1_config.json
│   │   ├── zero_stage2_config.json
│   │   ├── zero_stage2_offload_config.json
│   │   ├── zero_stage3_config.json
│   │   └── zero_stage3_offload_config.json
│   ├── finetune_lm_tpu.py
│   ├── inference/
│   │   ├── distributed/
│   │   │   ├── README.md
│   │   │   ├── distributed_image_generation.py
│   │   │   ├── distributed_speech_generation.py
│   │   │   ├── florence2.py
│   │   │   ├── llava_next_video.py
│   │   │   ├── phi2.py
│   │   │   └── stable_diffusion.py
│   │   └── pippy/
│   │       ├── README.md
│   │       ├── bert.py
│   │       ├── gpt2.py
│   │       ├── llama.py
│   │       ├── requirements.txt
│   │       └── t5.py
│   ├── multigpu_remote_launcher.py
│   ├── nlp_example.py
│   ├── requirements.txt
│   ├── slurm/
│   │   ├── fsdp_config.yaml
│   │   ├── submit_multicpu.sh
│   │   ├── submit_multigpu.sh
│   │   ├── submit_multinode.sh
│   │   └── submit_multinode_fsdp.sh
│   └── torch_native_parallelism/
│       ├── README.md
│       ├── configs/
│       │   ├── cp.yaml
│       │   └── tp_hsdp.yaml
│       ├── fsdp2_fp8.py
│       ├── nd_parallel.py
│       ├── nd_parallel_trainer.py
│       └── utils.py
├── manim_animations/
│   ├── big_model_inference/
│   │   ├── stage_1.py
│   │   ├── stage_2.py
│   │   ├── stage_3.py
│   │   ├── stage_4.py
│   │   └── stage_5.py
│   └── dataloaders/
│       ├── stage_0.py
│       ├── stage_1.py
│       ├── stage_2.py
│       ├── stage_3.py
│       ├── stage_4.py
│       ├── stage_5.py
│       ├── stage_6.py
│       └── stage_7.py
├── pyproject.toml
├── setup.py
├── src/
│   └── accelerate/
│       ├── __init__.py
│       ├── accelerator.py
│       ├── big_modeling.py
│       ├── checkpointing.py
│       ├── commands/
│       │   ├── __init__.py
│       │   ├── accelerate_cli.py
│       │   ├── config/
│       │   │   ├── __init__.py
│       │   │   ├── cluster.py
│       │   │   ├── config.py
│       │   │   ├── config_args.py
│       │   │   ├── config_utils.py
│       │   │   ├── default.py
│       │   │   ├── sagemaker.py
│       │   │   └── update.py
│       │   ├── env.py
│       │   ├── estimate.py
│       │   ├── launch.py
│       │   ├── menu/
│       │   │   ├── __init__.py
│       │   │   ├── cursor.py
│       │   │   ├── helpers.py
│       │   │   ├── input.py
│       │   │   ├── keymap.py
│       │   │   └── selection_menu.py
│       │   ├── merge.py
│       │   ├── test.py
│       │   ├── to_fsdp2.py
│       │   ├── tpu.py
│       │   └── utils.py
│       ├── data_loader.py
│       ├── hooks.py
│       ├── inference.py
│       ├── launchers.py
│       ├── local_sgd.py
│       ├── logging.py
│       ├── memory_utils.py
│       ├── optimizer.py
│       ├── parallelism_config.py
│       ├── scheduler.py
│       ├── state.py
│       ├── test_utils/
│       │   ├── __init__.py
│       │   ├── examples.py
│       │   ├── scripts/
│       │   │   ├── __init__.py
│       │   │   ├── external_deps/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── test_checkpointing.py
│       │   │   │   ├── test_ds_alst_ulysses_sp.py
│       │   │   │   ├── test_ds_multiple_model.py
│       │   │   │   ├── test_metrics.py
│       │   │   │   ├── test_peak_memory_usage.py
│       │   │   │   ├── test_performance.py
│       │   │   │   ├── test_pippy.py
│       │   │   │   └── test_zero3_integration.py
│       │   │   ├── test_cli.py
│       │   │   ├── test_ddp_comm_hook.py
│       │   │   ├── test_distributed_data_loop.py
│       │   │   ├── test_merge_weights.py
│       │   │   ├── test_notebook.py
│       │   │   ├── test_ops.py
│       │   │   ├── test_script.py
│       │   │   └── test_sync.py
│       │   ├── testing.py
│       │   └── training.py
│       ├── tracking.py
│       └── utils/
│           ├── __init__.py
│           ├── ao.py
│           ├── bnb.py
│           ├── constants.py
│           ├── dataclasses.py
│           ├── deepspeed.py
│           ├── environment.py
│           ├── fsdp_utils.py
│           ├── imports.py
│           ├── launch.py
│           ├── megatron_lm.py
│           ├── memory.py
│           ├── modeling.py
│           ├── offload.py
│           ├── operations.py
│           ├── other.py
│           ├── random.py
│           ├── rich.py
│           ├── torch_xla.py
│           ├── tqdm.py
│           ├── transformer_engine.py
│           └── versions.py
├── tests/
│   ├── __init__.py
│   ├── deepspeed/
│   │   ├── ds_config_zero2.json
│   │   ├── ds_config_zero2_model_only.json
│   │   ├── ds_config_zero3.json
│   │   ├── ds_config_zero3_model_only.json
│   │   ├── test_alst_ulysses_sp.py
│   │   ├── test_deepspeed.py
│   │   ├── test_deepspeed_gradient_accumulation.py
│   │   └── test_deepspeed_multiple_model.py
│   ├── fsdp/
│   │   └── test_fsdp.py
│   ├── test_accelerator.py
│   ├── test_big_modeling.py
│   ├── test_cli.py
│   ├── test_compile.py
│   ├── test_configs/
│   │   ├── 0_11_0.yaml
│   │   ├── 0_12_0.yaml
│   │   ├── 0_28_0_mpi.yaml
│   │   ├── 0_30_0_sagemaker.yaml
│   │   ├── 0_34_0_fp8.yaml
│   │   ├── README.md
│   │   ├── invalid_keys.yaml
│   │   ├── latest.yaml
│   │   ├── latest_fsdp.yaml
│   │   └── validate_launch_cmd.yaml
│   ├── test_cpu.py
│   ├── test_data_loader.py
│   ├── test_dataclasses.py
│   ├── test_examples.py
│   ├── test_fp8.py
│   ├── test_grad_sync.py
│   ├── test_hooks.py
│   ├── test_imports.py
│   ├── test_kwargs_handlers.py
│   ├── test_launch.py
│   ├── test_load_checkpoint_and_dispatch_with_broadcast.py
│   ├── test_logging.py
│   ├── test_memory_utils.py
│   ├── test_metrics.py
│   ├── test_modeling_utils.py
│   ├── test_multidevice.py
│   ├── test_offload.py
│   ├── test_optimizer.py
│   ├── test_quantization.py
│   ├── test_sagemaker.py
│   ├── test_samples/
│   │   ├── MRPC/
│   │   │   ├── dev.csv
│   │   │   └── train.csv
│   │   └── test_command_file.sh
│   ├── test_scheduler.py
│   ├── test_state_checkpointing.py
│   ├── test_tpu.py
│   ├── test_tracking.py
│   ├── test_utils.py
│   ├── tp/
│   │   ├── fsdp2_tp_preparation.py
│   │   ├── fsdp2_tp_preparation_config.yaml
│   │   └── test_tp.py
│   └── xla_spawn.py
└── utils/
    ├── log_reports.py
    └── stale.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .devcontainer/devcontainer.json
================================================
// File only needed for VSCode users to have proper Docker based interpreters
{
    "name": "accelerate_dev_environment",
    "build": {
        // ACTION NEEDED: comment/uncomment the relevant line depending on whether you are in a CPU/GPU environment
         "dockerfile": "../docker/accelerate-cpu/Dockerfile"
//        "dockerfile": "../docker/accelerate-gpu/Dockerfile"
    },
    "runArgs": [
        // ACTION NEEDED: uncomment the next line if your local machine has GPUs available
//        "--gpus", "all",
        // Enable the docker container to access system resources
        "--ipc", "host"
    ],
    "remoteEnv": {
        "PYTHONPATH": "${containerEnv:PATH}:${containerWorkspaceFolder}"
    },
    "customizations": {
        "vscode": {
            "extensions": [
                // Ensure we have IntelliSense in VSCode when running inside container
                "ms-python.python"
            ]
        }
    },
    "workspaceFolder": "/workspaces/accelerate",
    // Need git for VSCode to color code modifications. Only runs when building environment.
    "onCreateCommand": "apt-get update && apt-get install -y git && pip install -e '.[dev]'"
}

================================================
FILE: .github/ISSUE_TEMPLATE/bug-report.yml
================================================
name: "\U0001F41B Bug Report"
description: Submit a bug report to help us improve Accelerate
body:
  - type: markdown
    attributes: 
      value: | 
        Thanks for taking the time to submit a bug report! 🐛 
        If this is not a bug related to the Accelerate library directly, but instead a general question about your code or the library specifically, please use the [forums](https://discuss.huggingface.co/c/accelerate/18).

  - type: textarea
    id: system-info
    attributes:
      label: System Info
      description: Please share your accelerate configuration with us. You can run the command `accelerate env` and copy-paste its output below
      render: Shell
      placeholder: accelerate version, OS, python version, numpy version, torch version, and accelerate's configuration
    validations:
      required: true
  
  - type: checkboxes
    id: information-scripts-examples
    attributes:
      label: Information
      description: 'The problem arises when using:'
      options:
        - label: "The official example scripts"
        - label: "My own modified scripts"
  
  - type: checkboxes
    id: information-tasks
    attributes:
      label: Tasks
      description: "The tasks I am working on are:"
      options:
        - label: "One of the scripts in the examples/ folder of Accelerate or an officially supported `no_trainer` script in the `examples` folder of the `transformers` repo (such as `run_no_trainer_glue.py`)"
        - label: "My own task or dataset (give details below)"
  
  - type: textarea
    id: reproduction
    validations:
      required: true
    attributes:
      label: Reproduction
      description: |
        Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
        If you have code snippets, error messages, or stack traces, please provide them here as well.
        Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
        Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.

      placeholder: |
        Steps to reproduce the behavior:
          
          1.
          2.
          3.

  - type: textarea
    id: expected-behavior
    validations:
      required: true
    attributes:
      label: Expected behavior
      description: "A clear and concise description of what you would expect to happen."


================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet though.

Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution.

Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change.

Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/accelerate/blob/main/CONTRIBUTING.md#submitting-a-pull-request-pr),
      Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the
      [documentation guidelines](https://github.com/huggingface/accelerate/tree/main/docs), and
      [here are tips on formatting docstrings](https://github.com/huggingface/accelerate/tree/main/docs#writing-documentation---specification).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @

 If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.

- Big modeling: @SunMarc
- Fully-Sharded Data Parallelism: @SunMarc
- DeepSpeed: @SunMarc
- Command Line Interface: @SunMarc
- Documentation: @SunMarc
- Core parts of the library: @BenjaminBossan @SunMarc
- Maintained examples: @SunMarc

 -->

================================================
FILE: .github/workflows/build-docker-images-release.yml
================================================
name: Build Docker images (releases)

on:
  workflow_dispatch:
  release:
    types: [published]

concurrency:
  group: docker-image-builds
  cancel-in-progress: false

jobs:
  get-version:
    runs-on: ubuntu-latest
    outputs:
      version: ${{ steps.step1.outputs.version }}
    steps:
      - uses: actions/checkout@v6
      - id: step1
        run: echo "version=$(python setup.py --version)" >> $GITHUB_OUTPUT

  version-cpu:
    name: "Latest Accelerate CPU [version]"
    runs-on:
      group: aws-general-8-plus
    needs: get-version
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Build and Push CPU
        uses: docker/build-push-action@v6
        with:
          file: docker/accelerate-cpu/Dockerfile
          push: true
          tags: huggingface/accelerate:cpu-release-${{ needs.get-version.outputs.version }}

  version-cuda:
    name: "Latest Accelerate GPU [version]"
    runs-on:
      group: aws-g6-4xlarge-plus
    needs: get-version
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Build and Push GPU
        uses: docker/build-push-action@v6
        with:
          file: docker/accelerate-gpu/Dockerfile
          push: true
          tags: huggingface/accelerate:gpu-release-${{needs.get-version.outputs.version}}

  version-cuda-deepspeed:
    name: "Latest Accelerate GPU DeepSpeed [version]"
    runs-on:
      group: aws-g6-4xlarge-plus
    needs: get-version
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Build and Push GPU
        uses: docker/build-push-action@v6
        with:
          file: docker/accelerate-gpu-deepspeed/Dockerfile
          push: true
          tags: huggingface/accelerate:gpu-deepspeed-release-${{needs.get-version.outputs.version}}

  version-cuda-fp8-transformerengine:
    name: "Latest Accelerate GPU FP8 TransformerEngine [version]"
    runs-on:
      group: aws-g6-4xlarge-plus
    needs: get-version
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Build and Push GPU
        uses: docker/build-push-action@v6
        with:
          file: docker/accelerate-gpu/Dockerfile
          push: true
          tags: huggingface/accelerate:gpu-fp8-transformerengine-release-${{needs.get-version.outputs.version}}

================================================
FILE: .github/workflows/build_and_run_tests.yml
================================================
name: Trigger docker images and run tests

on:
  push:
    branches:
      - main
  workflow_dispatch:

env:
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

jobs:
  check-for-source:
    runs-on: ubuntu-latest
    name: Check if setup was changed
    outputs:
      changed: ${{ steps.was_changed.outputs.changed }}
    steps:
      - uses: actions/checkout@v6
        with: 
          fetch-depth: "2"
      
      - name: Get changed files
        id: changed-files
        uses: tj-actions/changed-files@3f54ebb830831fc121d3263c1857cfbdc310cdb9 #v42
      
      - name: Was setup changed 
        id: was_changed
        run: |
          for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
            if [ `basename "${file}"` == "setup.py" ]; then
              echo "changed=1" >> $GITHUB_OUTPUT
            fi
          done
          
  build-docker-containers:
    needs: check-for-source
    if: (github.event_name == 'push') && (needs.check-for-source.outputs.changed == '1')
    uses: ./.github/workflows/build_docker_images.yml
    secrets: inherit

  run-merge-tests:
    needs: build-docker-containers
    if: always()
    uses: ./.github/workflows/run_merge_tests.yml

  run-integration-tests:
    needs: build-docker-containers
    if: always()
    uses: ./.github/workflows/self_hosted_integration_tests.yml


================================================
FILE: .github/workflows/build_docker_images.yml
================================================
name: Build Docker images (scheduled)

on:
  workflow_dispatch:
  workflow_call:
  schedule:
    - cron: "0 1 * * *"

concurrency:
  group: docker-image-builds
  cancel-in-progress: false

jobs:
  latest-cpu:
    name: "Latest Accelerate CPU [dev]"
    runs-on:
      group: aws-general-8-plus
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
      - name: Get current date
        id: date
        run: |
          echo "date=$(date '+%Y-%m-%d')" >> $GITHUB_ENV
      - name: Build and Push CPU
        uses: docker/build-push-action@v6
        with:
          file: docker/accelerate-cpu/Dockerfile
          push: true
          tags: |
            huggingface/accelerate:cpu-nightly
            huggingface/accelerate:cpu-nightly-${{ env.date }}

  latest-cuda:
    name: "Latest Accelerate GPU [dev]"
    runs-on:
      group: aws-g6-4xlarge-plus
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
      - name: Get current date
        id: date
        run: |
          echo "date=$(date '+%Y-%m-%d')" >> $GITHUB_ENV
      - name: Build and Push GPU
        uses: docker/build-push-action@v6
        with:
          file: docker/accelerate-gpu/Dockerfile
          push: true
          tags: |
            huggingface/accelerate:gpu-nightly
            huggingface/accelerate:gpu-nightly-${{ env.date }}

  latest-cuda-deepspeed:
    name: "Latest Accelerate GPU DeepSpeed [dev]"
    runs-on:
      group: aws-g6-4xlarge-plus
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
      - name: Get current date
        id: date
        run: |
          echo "date=$(date '+%Y-%m-%d')" >> $GITHUB_ENV
      - name: Build and Push GPU
        uses: docker/build-push-action@v6
        with:
          file: docker/accelerate-gpu-deepspeed/Dockerfile
          push: true
          tags: |
            huggingface/accelerate:gpu-deepspeed-nightly
            huggingface/accelerate:gpu-deepspeed-nightly-${{ env.date }}

  latest-cuda-fp8-transformerengine:
    name: "Latest Accelerate GPU FP8 TransformerEngine [dev]"
    runs-on:
      group: aws-g6-4xlarge-plus
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
      - name: Get current date
        id: date
        run: |
          echo "date=$(date '+%Y-%m-%d')" >> $GITHUB_ENV
          # Get the previous month
          echo "base_year=$(date -d 'last month' '+%y')" >> $GITHUB_ENV
          echo "base_month=$(date -d 'last month' '+%m')" >> $GITHUB_ENV
      - name: Build and Push GPU
        uses: docker/build-push-action@v6
        with:
          file: benchmarks/fp8/transformer_engine/Dockerfile
          push: true
          tags: huggingface/accelerate:gpu-fp8-transformerengine-nightly-${{ env.date }}
          build-args: |
            BASE_YEAR=${{ env.base_year }}
            BASE_MONTH=${{ env.base_month }}

================================================
FILE: .github/workflows/build_documentation.yml
================================================
name: Build documentation

on:
  push:
    branches:
      - main
      - doc-builder*
      - v*-release

jobs:
   build:
    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
    with:
      commit_sha: ${{ github.sha }}
      package: accelerate
      custom_container: huggingface/transformers-doc-builder
    secrets:
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}


================================================
FILE: .github/workflows/build_pr_documentation.yml
================================================
name: Build PR Documentation

on:
  pull_request:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  build:
    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
    with:
      commit_sha: ${{ github.event.pull_request.head.sha }}
      pr_number: ${{ github.event.number }}
      package: accelerate
      custom_container: huggingface/transformers-doc-builder


================================================
FILE: .github/workflows/fp8_runner.yml
================================================
name: Test FP8 Runner

on:
  workflow_dispatch:

env:
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
jobs:
  set-prev-day:
    runs-on: ubuntu-latest
    outputs:
      prev-day: ${{ steps.set-prev-day.outputs.prev-day }}
    steps:
      - name: Set PREV_DAY
        id: set-prev-day
        run: |
          PREV_DAY=$(date -d "yesterday" '+%Y-%m-%d')
          echo "prev-day=$PREV_DAY" >> $GITHUB_OUTPUT
  run-fp8-tests:
    needs: set-prev-day
    runs-on:
      group: aws-g6e-12xlarge
    container:
      image: huggingface/accelerate:gpu-fp8-transformerengine-nightly-${{ needs.set-prev-day.outputs.prev-day }}
      options: --gpus all --shm-size "16gb"
    steps:
      - uses: actions/checkout@v6
      - name: Install the library
        run: |
            pip install -e .[test_prod,test_fp8]
      - name: Show installed libraries
        run: |
          pip freeze
      - name: Run TE FP8 tests
        run: |
          python -m pytest -s -v ./tests/test_fp8.py


================================================
FILE: .github/workflows/gaudi3_scheduled.yml
================================================
name: Gaudi3 tests (scheduled)

on:
  workflow_dispatch:
  schedule: # every day at 6 AM UTC
    - cron: "0 6 * * *"

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  run-gaudi3-tests:
    runs-on:
      group: itac-bm-emr-gaudi3-dell-2gaudi

    container:
      image: docker://vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES
      env:
        OMPI_MCA_btl_vader_single_copy_mechanism: none
        PT_ENABLE_INT64_SUPPORT: 1
        PT_HPU_LAZY_MODE: 0
        RUN_SLOW: 1

    steps:
      - name: HL-SMI (1)
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"

      - name: Extract HPU visible modules
        id: add-modules
        run: |
          export HABANA_VISIBLE_MODULES=$(hl-smi -Q module_id -f csv,noheader | tr '\n' ',' | sed 's/,$//')
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}" >> $GITHUB_ENV

      - name: HL-SMI (2)
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"

      - name: Checkout to Accelerate
        uses: actions/checkout@v6

      - name: Install Accelerate with Transformers & DeepSpeed
        run: |
          pip install -e .[testing] \
            git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 \
            git+https://github.com/huggingface/transformers.git

      - name: Run CLI tests
        if: ${{ !cancelled() && (success() || failure()) }}
        run: |
          make test_cli

      - name: Run Core tests
        if: ${{ !cancelled() && (success() || failure()) }}
        run: |
          make test_core

      - name: Run Big Modeling tests
        if: ${{ !cancelled() && (success() || failure()) }}
        run: |
          make test_big_modeling

      - name: Run DeepSpeed integration tests
        if: ${{ !cancelled() && (success() || failure()) }}
        run: |
          make test_deepspeed

      - name: Run FSDP integration tests
        if: ${{ !cancelled() && (success() || failure()) }}
        run: |
          make test_fsdp

      - name: Run TP integration tests
        if: ${{ !cancelled() && (success() || failure()) }}
        run: |
          make test_tp

      - name: Run Examples tests
        if: ${{ !cancelled() && (success() || failure()) }}
        run: |
          make test_examples


================================================
FILE: .github/workflows/integration_tests.yml
================================================
# CI for specifically ensuring integrations work fine (`transformers` mainly)
# Useful tips:
#  - Each new integration to test should have its own job, and follow a strategy method where we check both
#    the PyPI and GitHub versions.
#  - When checking the latest release of the integration, use
#    git checkout $(git describe --tags `git rev-list --tags --max-count=1`) to get the latest release.

name: Integration Tests

on:
  pull_request:
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
      - "examples/**"
      - "setup.py"
    types: [opened, synchronize, reopened]

env:
  HF_HOME: ~/hf_cache

jobs:
  # Installs Accelerate from this checkout, then runs the `transformers` Trainer tests
  # (tests/trainer) from a fresh clone of transformers on a GitHub-hosted runner.
  run-trainer-tests:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
    steps:
    - uses: actions/checkout@v6
    # pip cache is keyed on setup.py via `cache-dependency-path`.
    - name: Set up python 3.10
      uses: actions/setup-python@v6
      with:
        python-version: '3.10'
        cache: 'pip'
        cache-dependency-path: 'setup.py'

    - name: Install Accelerate from source
      run: |
        pip install --upgrade pip
        pip install -e .
    
    # transformers is cloned one directory above the checkout and installed from that clone.
    - name: Clone and install transformers
      run: |
        cd ..
        git clone https://github.com/huggingface/transformers
        cd transformers
        pip install .[torch,testing]

    - name: Show installed libraries
      run: |
        pip freeze

    # WANDB_DISABLED presumably turns off Weights & Biases reporting in the Trainer tests (confirm).
    - name: Run Trainer tests
      env:
        WANDB_DISABLED: true
      run: |
        cd ../transformers
        pytest -sv tests/trainer


================================================
FILE: .github/workflows/nightly.yml
================================================
name: Self-hosted runner with slow tests (scheduled)

on:
  workflow_dispatch:
  schedule:
    - cron: "0 2 * * *"

env:
  RUN_SLOW: "yes"
  IS_GITHUB_CI: "1"
  SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}


jobs:
  # Nightly `make test` and `make test_examples` inside the `gpu-nightly` image with one visible GPU.
  run_core_tests_single_gpu:
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      # Only device 0 is exposed to CUDA.
      CUDA_VISIBLE_DEVICES: "0"
      # NOTE(review): TEST_TYPE is presumably consumed by the reporting script - confirm.
      TEST_TYPE: "single_gpu"
    container:
      image: huggingface/accelerate:gpu-nightly
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
        shell: bash
    steps:
      # The repo is cloned by hand inside the container and pinned to the triggering commit;
      # `--no-deps` presumably relies on the image already shipping the dependencies (confirm).
      - name: Update clone & pip install
        run: |
          source activate accelerate
          git clone https://github.com/huggingface/accelerate;
          cd accelerate;
          git checkout ${{ github.sha }};
          pip install -e . --no-deps
          pip install pytest-reportlog tabulate

      - name: Show installed libraries
        run: |
          source activate accelerate;
          pip freeze

      - name: Run test on GPUs
        working-directory: accelerate
        run: |
          source activate accelerate
          make test

      # `if: always()` keeps this step running even when an earlier step failed.
      - name: Run examples on GPUs
        working-directory: accelerate
        if: always()
        run: |
          source activate accelerate
          pip uninstall comet_ml -y
          make test_examples

      # Appends the output of utils/log_reports.py to the job summary.
      - name: Generate Report
        working-directory: accelerate
        if: always()
        run: |
          pip install slack_sdk tabulate
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  # Nightly DeepSpeed, integration and example targets inside the `gpu-deepspeed-nightly` image
  # with one visible GPU.
  run_deepspeed_tests_single_gpu:
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      # Only device 0 is exposed to CUDA.
      CUDA_VISIBLE_DEVICES: "0"
      # NOTE(review): TEST_TYPE is presumably consumed by the reporting script - confirm.
      TEST_TYPE: "single_gpu_deepspeed"
    container:
      image: huggingface/accelerate:gpu-deepspeed-nightly
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
        shell: bash
    steps:
      # Manual clone inside the container, pinned to the triggering commit.
      - name: Update clone & pip install
        run: |
          source activate accelerate
          git clone https://github.com/huggingface/accelerate;
          cd accelerate;
          git checkout ${{ github.sha }};
          pip install -e . --no-deps
          pip install pytest-reportlog tabulate

      - name: Show installed libraries
        run: |
          source activate accelerate;
          pip freeze

      - name: Run test on GPUs
        working-directory: accelerate
        run: |
          source activate accelerate
          make test_deepspeed

      # The remaining steps use `if: always()` so they still run after an earlier failure.
      - name: Run Integration tests on GPUs
        working-directory: accelerate
        if: always()
        run: |
          source activate accelerate
          make test_integrations

      - name: Run examples on GPUs
        working-directory: accelerate
        if: always()
        run: |
          source activate accelerate
          pip uninstall comet_ml -y
          make test_examples

      # Appends the output of utils/log_reports.py to the job summary.
      - name: Generate Report
        working-directory: accelerate
        if: always()
        run: |
          pip install slack_sdk tabulate
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  # Nightly core, big-modeling, CLI, integration and example targets inside the `gpu-nightly`
  # image with two visible GPUs.
  run_core_tests_multi_gpu:
    runs-on:
      group: aws-g6-12xlarge-plus
    env:
      # Devices 0 and 1 are exposed to CUDA.
      CUDA_VISIBLE_DEVICES: "0,1"
      # NOTE(review): TEST_TYPE is presumably consumed by the reporting script - confirm.
      TEST_TYPE: "multi_gpu"
    container:
      image: huggingface/accelerate:gpu-nightly
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
        shell: bash
    steps:
      # Manual clone inside the container, pinned to the triggering commit.
      - name: Update clone
        run: |
          source activate accelerate
          git clone https://github.com/huggingface/accelerate;
          cd accelerate;
          git checkout ${{ github.sha }};
          pip install -e . --no-deps
          pip install pytest-reportlog tabulate

      - name: Show installed libraries
        run: |
          source activate accelerate;
          pip freeze

      # Three make targets run back to back in one step.
      - name: Run core and big modeling tests on GPUs
        working-directory: accelerate
        run: |
          source activate accelerate
          make test_core
          make test_big_modeling
          make test_cli

      # The remaining steps use `if: always()` so they still run after an earlier failure.
      - name: Run Integration tests on GPUs
        working-directory: accelerate
        if: always()
        run: |
          source activate accelerate
          make test_integrations

      - name: Run examples on GPUs
        working-directory: accelerate
        if: always()
        run: |
          source activate accelerate
          pip uninstall comet_ml -y
          make test_examples

      # Appends the output of utils/log_reports.py to the job summary.
      - name: Generate Report
        working-directory: accelerate
        if: always()
        run: |
          pip install slack_sdk tabulate
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  # Nightly DeepSpeed, integration and example targets inside the `gpu-deepspeed-nightly` image
  # with two visible GPUs.
  run_deepspeed_tests_multi_gpu:
    runs-on:
      group: aws-g6-12xlarge-plus
    env:
      # Devices 0 and 1 are exposed to CUDA.
      CUDA_VISIBLE_DEVICES: "0,1"
      # NOTE(review): TEST_TYPE is presumably consumed by the reporting script - confirm.
      TEST_TYPE: "multi_gpu_deepspeed"
    container:
      image: huggingface/accelerate:gpu-deepspeed-nightly
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
        shell: bash
    steps:
      # Manual clone inside the container, pinned to the triggering commit.
      - name: Update clone
        run: |
          source activate accelerate
          git clone https://github.com/huggingface/accelerate;
          cd accelerate;
          git checkout ${{ github.sha }};
          pip install -e . --no-deps
          pip install pytest-reportlog tabulate

      - name: Show installed libraries
        run: |
          source activate accelerate;
          pip freeze

      - name: Run DeepSpeed tests
        working-directory: accelerate
        run: |
          source activate accelerate
          make test_deepspeed

      # The remaining steps use `if: always()` so they still run after an earlier failure.
      - name: Run Integration tests on GPUs
        working-directory: accelerate
        if: always()
        run: |
          source activate accelerate
          make test_integrations

      - name: Run examples on GPUs
        working-directory: accelerate
        if: always()
        run: |
          source activate accelerate
          pip uninstall comet_ml -y
          make test_examples

      # Appends the output of utils/log_reports.py to the job summary.
      - name: Generate Report
        working-directory: accelerate
        if: always()
        run: |
          pip install slack_sdk tabulate
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY


  # Calls the self-hosted integration-test workflow as a reusable workflow from this nightly run.
  run-integration-tests:
    if: always()
    uses: ./.github/workflows/self_hosted_integration_tests.yml


================================================
FILE: .github/workflows/pr_style_bot.yml
================================================
# To run this bot, comment "@bot /style" on a PR
name: Style Bot

on:
  issue_comment:
    types: [created]

permissions:
  contents: write
  pull-requests: write

jobs:
  # Delegates to the reusable style-bot workflow in huggingface_hub (tracked at `main`),
  # passing the `[quality]` extra as the dependency set and GITHUB_TOKEN as the bot token.
  style:
    uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main
    with:
      python_quality_dependencies: "[quality]"
      style_command_type: "default"
    secrets:
      bot_token: ${{ secrets.GITHUB_TOKEN }}

================================================
FILE: .github/workflows/quality.yml
================================================
name: Quality Check

on: [pull_request]

jobs:
  # Lint/format gate: installs the `quality` extra and runs `make quality`.
  quality:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6
    # pip cache is keyed on setup.py.
    - name: Set up Python 3.10
      uses: actions/setup-python@v6
      with:
        python-version: "3.10"
        cache: pip
        cache-dependency-path: setup.py
    - name: Install Python dependencies
      run: |
        pip install -e .[quality]
    - name: Run Quality check
      run: |
        make quality
    # Runs only when a previous step failed; writes a remediation hint to the job summary.
    - name: Check if failure
      if: failure()
      run: |
        echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and rerun 'make style; make quality;'" >> $GITHUB_STEP_SUMMARY


================================================
FILE: .github/workflows/run_merge_tests.yml
================================================
name: Self-hosted runner tests (push to "main")

on:
  workflow_call:
  workflow_dispatch:

env:
  TESTING_MOCKED_DATALOADERS: "1"
  IS_GITHUB_CI: "1"

jobs:
  # CLI, core and example make targets inside the `gpu-nightly` image with one visible GPU.
  run_core_tests_single_gpu:
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      # Only device 0 is exposed to CUDA.
      CUDA_VISIBLE_DEVICES: "0"
    container:
      image: huggingface/accelerate:gpu-nightly
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
        shell: bash
    steps:
      # Manual clone inside the container, pinned to the triggering commit; installs the
      # `testing` and `test_trackers` extras with `-U`.
      - name: Install accelerate
        run: |
          source activate accelerate;
          git clone https://github.com/huggingface/accelerate;
          cd accelerate;
          git checkout ${{ github.sha }};
          pip install -e .[testing,test_trackers] -U;
          pip install pytest-reportlog tabulate  ;

      - name: Show installed libraries
        run: |
          source activate accelerate;
          pip freeze

      - name: Run CLI tests (use make cli)
        working-directory: accelerate
        run: |
          source activate accelerate;
          make test_cli

      # The remaining steps use `if: always()` so they still run after an earlier failure.
      - name: Run test on GPUs
        working-directory: accelerate
        if: always()
        run: |
          source activate accelerate;
          make test
      - name: Run examples on GPUs
        working-directory: accelerate
        if: always()
        run: |
          source activate accelerate;
          pip uninstall comet_ml -y;
          make test_examples

      # Appends the output of utils/log_reports.py to the job summary.
      - name: Generate Report
        working-directory: accelerate
        if: always()
        run: |
          pip install tabulate;
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  # `make test_deepspeed` inside the `gpu-deepspeed-nightly` image with one visible GPU.
  run_deepspeed_tests_single_gpu:
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      # Only device 0 is exposed to CUDA.
      CUDA_VISIBLE_DEVICES: "0"
    container:
      image: huggingface/accelerate:gpu-deepspeed-nightly
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
        shell: bash
    steps:
      # Manual clone inside the container, pinned to the triggering commit.
      - name: Install accelerate
        run: |
          source activate accelerate;
          git clone https://github.com/huggingface/accelerate;
          cd accelerate;
          git checkout ${{ github.sha }};
          pip install -e .[testing,test_trackers] -U;
          pip install pytest-reportlog tabulate  ;

      - name: Show installed libraries
        run: |
          source activate accelerate;
          pip freeze

      - name: Run test on GPUs
        working-directory: accelerate
        if: always()
        run: |
          source activate accelerate;
          make test_deepspeed

      # Appends the output of utils/log_reports.py to the job summary.
      - name: Generate Report
        working-directory: accelerate
        if: always()
        run: |
          pip install tabulate;
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  # Core and example make targets inside the `gpu-nightly` image with two visible GPUs.
  run_core_tests_multi_gpu:
    runs-on:
      group: aws-g6-12xlarge-plus
    env:
      # Quoted so the value is always a string, whatever the YAML parser makes of a bare `0,1`;
      # this also matches how the sibling jobs spell the variable.
      CUDA_VISIBLE_DEVICES: "0,1"
    container:
      image: huggingface/accelerate:gpu-nightly
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
        shell: bash
    steps:
      # Manual clone inside the container, pinned to the triggering commit.
      - name: Update clone
        run: |
          source activate accelerate;
          git clone https://github.com/huggingface/accelerate;
          cd accelerate;
          git checkout ${{ github.sha }};
          pip install -e .[testing,test_trackers] -U;
          pip install pytest-reportlog tabulate

      - name: Show installed libraries
        run: |
          source activate accelerate;
          pip freeze

      - name: Run test on GPUs
        working-directory: accelerate
        run: |
          source activate accelerate;
          make test

      # The remaining steps use `if: always()` so they still run after an earlier failure.
      - name: Run examples on GPUs
        working-directory: accelerate
        if: always()
        run: |
          source activate accelerate;
          pip uninstall comet_ml -y;
          make test_examples

      # Runs inside the `accelerate` env (where tabulate was installed above) and appends the
      # output of utils/log_reports.py to the job summary.
      - name: Generate Report
        working-directory: accelerate
        if: always()
        run: |
          source activate accelerate;
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  # `make test_deepspeed` inside the `gpu-deepspeed-nightly` image with two visible GPUs.
  run_deepspeed_tests_multi_gpu:
    runs-on:
      group: aws-g6-12xlarge-plus
    env:
      # Pin the visible devices like every other GPU job in this file; the nightly
      # multi-GPU DeepSpeed job uses the same "0,1" pair.
      CUDA_VISIBLE_DEVICES: "0,1"
    container:
      image: huggingface/accelerate:gpu-deepspeed-nightly
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
        shell: bash
    steps:
      # Manual clone inside the container, pinned to the triggering commit.
      - name: Install accelerate
        run: |
          source activate accelerate;
          git clone https://github.com/huggingface/accelerate;
          cd accelerate;
          git checkout ${{ github.sha }};
          pip install -e .[testing,test_trackers] -U;
          pip install pytest-reportlog tabulate  ;

      - name: Show installed libraries
        run: |
          source activate accelerate;
          pip freeze

      - name: Run test on GPUs
        working-directory: accelerate
        if: always()
        run: |
          source activate accelerate;
          make test_deepspeed

      # Appends the output of utils/log_reports.py to the job summary.
      - name: Generate Report
        working-directory: accelerate
        if: always()
        run: |
          pip install tabulate;
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY


================================================
FILE: .github/workflows/self_hosted_integration_tests.yml
================================================
# CI for specifically ensuring integrations work fine (`transformers` mainly) on GPUs
# Useful tips:
#  - `working-directory` should be set to the root of the repo, which is cloned on the actual CI runner.
#    On-prem it follows the directory structure `actions-runner/_work/{repo_name}/{repo_name}/{cloned_repo}`,
#    but in Actions, `working-directory` is resolved relative to the `{repo_name}` level only.
#  - Each new integration to test should have its own job, and follow a strategy method where we check both
#    the PyPI and GitHub versions.
#  - Workflow call lets this be called from `build_and_run_tests.yml`
#  - When using a docker container, it's recommended to set `--shm-size`, we use 16gb.
name: Integration Tests (push to "main")

on:
  workflow_call:
  workflow_dispatch:

env:
  HF_HOME: ~/hf_cache

defaults:
  run:
    shell: bash

jobs:
  # Runs the transformers trainer, deepspeed and example tests against this commit of Accelerate,
  # once with one visible GPU and once with two (see the matrix).
  run-trainer-tests:
    container:
      image: huggingface/accelerate:gpu-deepspeed-nightly
      options: --gpus all --shm-size "16gb"
    runs-on:
      group: aws-g6-12xlarge-plus
    strategy:
      fail-fast: false
      matrix:
        # Each entry becomes CUDA_VISIBLE_DEVICES for the test steps below.
        cuda_visible_devices: [
          "0",
          "0,1"
        ]
    steps:
      # Shallow clone of transformers, installed from that clone.
      - name: Install transformers
        run: |
          source activate accelerate;
          git clone https://github.com/huggingface/transformers --depth 1;
          cd transformers;
          pip install .[torch,deepspeed-testing];
          cd ..;

      # Accelerate is installed after transformers, pinned to the triggering commit, and the
      # tracker packages comet_ml / wandb / dvclive are removed afterwards.
      - name: Install accelerate
        run: |
          source activate accelerate;
          git clone https://github.com/huggingface/accelerate;
          cd accelerate;
          git checkout ${{ github.sha }} ;
          pip install -e .[testing];
          pip uninstall comet_ml wandb dvclive -y
          cd ..;

      - name: Show installed libraries
        run: |
          source activate accelerate;
          pip freeze

      - name: Run trainer tests
        working-directory: transformers/
        env:
          CUDA_VISIBLE_DEVICES: ${{ matrix.cuda_visible_devices }}
          WANDB_DISABLED: true
        run: |
          source activate accelerate;
          pytest -sv tests/trainer

      # `if: always()` so a trainer-test failure does not hide deepspeed results.
      - name: Run deepspeed tests
        working-directory: transformers/
        env:
          CUDA_VISIBLE_DEVICES: ${{ matrix.cuda_visible_devices }}
          WANDB_DISABLED: true
        if: always()
        run: |
          source activate accelerate;
          pytest -sv tests/deepspeed

      # `if: always()` for the same reason as the deepspeed step: without it, any earlier
      # test failure would skip the examples tests entirely.
      - name: Run transformers examples tests
        working-directory: transformers/
        env:
          CUDA_VISIBLE_DEVICES: ${{ matrix.cuda_visible_devices }}
          WANDB_DISABLED: true
        if: always()
        run: |
          source activate accelerate
          pip install -r examples/pytorch/_tests_requirements.txt
          pytest -sv examples/pytorch/test_accelerate_examples.py examples/pytorch/test_pytorch_examples.py

  # Runs skorch's `TestAccelerate` tests against this commit of Accelerate inside the
  # `gpu-nightly` image.
  run-skorch-tests:
    container:
      image: huggingface/accelerate:gpu-nightly
      options: --gpus all --shm-size "16gb"
    runs-on:
      group: aws-g6-12xlarge-plus
    strategy:
      fail-fast: false
    steps:
      # Literal block scalar (`|`) like every other step: without it YAML folds the lines into
      # a single command line, which only works while every line ends in `;`.
      - name: Install accelerate
        run: |
          source activate accelerate;
          git clone https://github.com/huggingface/accelerate;
          cd accelerate;
          git checkout ${{ github.sha }};
          pip install -e .[testing];
          cd ..

      # skorch is cloned and installed from its `master` branch, plus `flaky`.
      - name: Install skorch
        run: |
          source activate accelerate
          git clone https://github.com/skorch-dev/skorch;
          cd skorch;
          git config --global --add safe.directory '*'
          git checkout master && git pull
          pip install .[test]
          pip install flaky

      - name: Show installed libraries
        run: |
          source activate accelerate;
          pip freeze

      # `-k TestAccelerate` selects only tests whose names match that expression.
      - name: Run skorch tests
        working-directory: skorch/
        run: |
          source activate accelerate;
          pytest -sv -k TestAccelerate


================================================
FILE: .github/workflows/stale.yml
================================================
name: Stale Bot

on:
  schedule:
    - cron: "0 15 * * *"
  workflow_dispatch:

jobs:
  # Runs utils/stale.py with PyGithub installed; guarded so it only executes in the
  # huggingface/accelerate repository.
  close_stale_issues:
    name: Close Stale Issues
    if: github.repository == 'huggingface/accelerate'
    runs-on: ubuntu-latest
    # Token scopes granted to this job.
    permissions:
      issues: write
      pull-requests: write
    env:
      # NOTE(review): presumably read by utils/stale.py to authenticate - confirm.
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
    - uses: actions/checkout@v6
    
    # pip cache is keyed on setup.py.
    - name: Setup Python
      uses: actions/setup-python@v6
      with:
        python-version: '3.10'
        cache: 'pip'
        cache-dependency-path: 'setup.py'
    
    - name: Install requirements
      run: |
        pip install PyGithub
    - name: Close stale issues
      run: |
        python utils/stale.py


================================================
FILE: .github/workflows/test.yml
================================================
name: Run Tests

on:
  pull_request:
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
      - "examples/**"
      - "setup.py"
    types: [opened, synchronize, reopened]

env:
  HF_HOME: ~/hf_cache
  TESTING_MOCKED_DATALOADERS: "1"
  IS_GITHUB_CI: "1"

jobs:
  # Runs every `make` test target in the matrix twice: once with the default torch install
  # and once with the pinned `minimum` torch/torchvision pair.
  run-tests:
    runs-on:
      group: aws-general-8-plus
    strategy:
      fail-fast: false
      matrix:
        pytorch-version:
          - latest
          - minimum
        test-kind:
          - test_prod
          - test_core
          - test_cli
          - test_big_modeling
          - test_deepspeed
          - test_fsdp
          - test_example_differences
          - test_checkpoint_step
          - test_checkpoint_epoch
          - test_rest
    steps:
    - uses: actions/checkout@v6
    # pip cache is keyed on setup.py.
    - name: Set up python 3.10
      uses: actions/setup-python@v6
      with:
        python-version: "3.10"
        cache: pip
        cache-dependency-path: setup.py

    # `test_prod` gets its own extra; every other target gets the testing extras.
    # `test_rest` additionally drops comet_ml, and the `minimum` leg pins torch afterwards.
    - name: Install the library
      run: |
        if [[ ${{ matrix.test-kind }} = test_prod ]]; then
          pip install -e .[test_prod]
        else
          pip install -e .[testing,test_trackers]
        fi
        if [[ ${{ matrix.test-kind }} = test_rest ]]; then
          pip uninstall comet_ml -y
        fi
        if [[ ${{ matrix.pytorch-version }} = minimum ]]; then
          pip install torchvision==0.19.0 torch==2.4.0
        fi
        pip install pytest-reportlog tabulate setuptools importlib_metadata

    - name: Show installed libraries
      run: |
        pip freeze

    # The matrix entry doubles as the make target name.
    - name: Run Tests
      env:
        PYTORCH_VERSION: ${{ matrix.pytorch-version }}
      run: |
        make ${{ matrix.test-kind }}

    # Runs even after a test failure; appends the report to the job summary.
    - name: Generate Report
      if: always()
      run: |
        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY


================================================
FILE: .github/workflows/test_imports.yml
================================================
name: Run Import Tests

on:
  pull_request:
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
      - "examples/**"
      - "setup.py"
    types: [opened, synchronize, reopened]

env:
  HF_HOME: ~/hf_cache
  TESTING_MOCKED_DATALOADERS: "1"
  IS_GITHUB_CI: "1"

jobs:
  # Runs tests/test_imports.py once with the default torch install and once with the
  # pinned `minimum` torch.
  run-tests:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        pytorch-version: [
          latest,
          minimum,
        ]
    steps:
    - uses: actions/checkout@v6
    # pip cache is keyed on setup.py.
    - name: Set up python 3.10
      uses: actions/setup-python@v6
      with:
        python-version: '3.10'
        cache: 'pip'
        cache-dependency-path: 'setup.py'

    # The `minimum` leg must actually downgrade torch, otherwise both matrix legs test the
    # same install. The pin is the torch version test.yml uses for its `minimum` leg.
    - name: Install the library
      run: |
        pip install -e .
        if [[ ${{ matrix.pytorch-version }} = minimum ]]; then pip install torch==2.4.0; fi
        pip install pytest-reportlog tabulate setuptools git+https://github.com/muellerzr/import-timer

    - name: Show installed libraries
      run: |
        pip freeze

    - name: Run Import Tests
      env:
        PYTORCH_VERSION: ${{ matrix.pytorch-version }}
      run: |
        pytest -sv tests/test_imports.py

    # Runs even after a test failure; appends the report to the job summary.
    - name: Generate Report
      if: always()
      run: |
        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY


================================================
FILE: .github/workflows/trufflehog.yml
================================================
on:
  push:

name: Secret Leaks

jobs:
  # Secret scan on every push using the TruffleHog action.
  trufflehog:
    runs-on: ubuntu-latest
    steps:
    # `fetch-depth: 0` fetches the full history instead of a shallow clone.
    - name: Checkout code
      uses: actions/checkout@v6
      with:
        fetch-depth: 0
    # NOTE(review): the action is tracked at `main` rather than pinned to a release or SHA.
    - name: Secret Scanning
      uses: trufflesecurity/trufflehog@main


================================================
FILE: .github/workflows/upload_pr_documentation.yml
================================================
name: Upload PR Documentation

on:
  workflow_run:
    workflows: ["Build PR Documentation"]
    types:
      - completed

jobs:
  # Delegates to doc-builder's reusable upload workflow (tracked at `main`) for the
  # `accelerate` package, forwarding the two secrets it is given below.
  build:
    uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
    with:
      package_name: accelerate
    secrets:
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# VSCode
.vscode

# IntelliJ
.idea

# Mac .DS_Store
.DS_Store

# More test things
wandb

# ruff
.ruff_cache


================================================
FILE: .pre-commit-config.yaml
================================================
# pre-commit hooks: ruff lint (with `--fix`) and ruff-format, plus merge-conflict and YAML checks.
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Hook revision is pinned to a tag.
    rev: v0.2.1
    hooks:
      - id: ruff
        args:
          - --fix
      - id: ruff-format
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: check-merge-conflict
      - id: check-yaml


================================================
FILE: CODE_OF_CONDUCT.md
================================================

# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
  overall community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or
  advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
  address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
feedback@huggingface.co.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series
of actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within
the community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.

Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see the FAQ at
https://www.contributor-covenant.org/faq. Translations are available at
https://www.contributor-covenant.org/translations.


================================================
FILE: CONTRIBUTING.md
================================================
<!---
Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# How to contribute to 🤗 Accelerate?

Everyone is welcome to contribute, and we value everybody's contribution. Code
is thus not the only way to help the community. Answering questions, helping
others, reaching out and improving the documentation are immensely valuable to
the community.

It also helps us if you spread the word: reference the library from blog posts
on the awesome projects it made possible, shout out on Twitter every time it has
helped you, or simply star the repo to say "thank you".

Whichever way you choose to contribute, please be mindful to respect our
[code of conduct](https://github.com/huggingface/accelerate/blob/main/CODE_OF_CONDUCT.md).

## You can contribute in so many ways!

Some of the ways you can contribute to Accelerate:
* Fixing outstanding issues with the existing code;
* Contributing to the examples or to the documentation;
* Submitting issues related to bugs or desired new features.

## Submitting a new issue or feature request

Do your best to follow these guidelines when submitting an issue or a feature
request. It will make it easier for us to come back to you quickly and with good
feedback.

### Did you find a bug?

The 🤗 Accelerate library is robust and reliable thanks to the users who notify us of
the problems they encounter. So thank you for reporting an issue.

First, we would really appreciate it if you could **make sure the bug was not
already reported** (use the search bar on GitHub under Issues).

Did not find it? :( So we can act quickly on it, please follow these steps:

* Include your **OS type and version**, and the versions of **Python** and **PyTorch**;
* Provide a short, self-contained code snippet that allows us to reproduce the bug in
  less than 30s;
* Provide your Accelerate configuration (located by default in `~/.cache/huggingface/accelerate/default_config.yaml`).

### Do you want a new feature?

A good feature request addresses the following points:

1. Motivation first:
* Is it related to a problem/frustration with the library? If so, please explain
  why. Providing a code snippet that demonstrates the problem is best.
* Is it related to something you would need for a project? We'd love to hear
  about it!
* Is it something you worked on and think could benefit the community?
  Awesome! Tell us what problem it solved for you.
2. Write a *full paragraph* describing the feature;
3. Provide a **code snippet** that demonstrates its future use;
4. In case this is related to a paper, please attach a link;
5. Attach any additional information (drawings, screenshots, etc.) you think may help.

If your issue is well written we're already 80% of the way there by the time you
post it.

## Submitting a pull request (PR)

Before writing code, we strongly advise you to search through the existing PRs or
issues to make sure that nobody is already working on the same thing. If you are
unsure, it is always a good idea to open an issue to get some feedback.

You will need basic `git` proficiency to be able to contribute to
🤗 Accelerate. `git` is not the easiest tool to use but it has the greatest
manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
Git](https://git-scm.com/book/en/v2) is a very good reference.

Follow these steps to start contributing:

1. Fork the [repository](https://github.com/huggingface/accelerate) by
   clicking on the 'Fork' button on the repository's page. This creates a copy of the code
   under your GitHub user account.

2. Clone your fork to your local disk, and add the base repository as a remote. The following command
   assumes you have your public SSH key uploaded to GitHub. See the following guide for more
   [information](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository).

   ```bash
   $ git clone git@github.com:<your Github handle>/accelerate.git
   $ cd accelerate
   $ git remote add upstream https://github.com/huggingface/accelerate.git
   ```

3. Create a new branch to hold your development changes, and do this for every new PR you work on.

   Start by synchronizing your `main` branch with the `upstream/main` branch (more details in the [GitHub Docs](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/syncing-a-fork)):

   ```bash
   $ git checkout main
   $ git fetch upstream
   $ git merge upstream/main
   ```

   Once your `main` branch is synchronized, create a new branch from it:

   ```bash
   $ git checkout -b a-descriptive-name-for-my-changes
   ```

   **Do not** work on the `main` branch.

4. Set up a development environment by running the following command in a conda or a virtual environment you've created for working on this library:

   ```bash
   $ pip install -e ".[dev]"
   ```
   
   This will install all testing and linting/code quality dependencies for the library (see `quality`, `test_dev`, 
   `test_prod` targets in [`setup.py`](./setup.py)).

   (If accelerate was already installed in the virtual environment, remove
   it with `pip uninstall accelerate` before reinstalling it in editable
   mode with the `-e` flag).

   Alternatively, if you are using [Visual Studio Code](https://code.visualstudio.com/Download), the fastest way to get set up is by using
   the provided Dev Container. Documentation on how to get started with dev containers is available [here](https://code.visualstudio.com/docs/remote/containers).

5. Develop the features on your branch.

   As you work on the features, you should make sure that the test suite
   passes. You should run the tests impacted by your changes like this (see 
   below an explanation regarding the environment variable):

   ```bash
   $ pytest tests/<TEST_TO_RUN>.py
   ```
   
   > For the following commands leveraging the `make` utility, we recommend using the WSL system when running on
   > Windows. More information [here](https://docs.microsoft.com/en-us/windows/wsl/about).

   You can also run the full suite with the following command.

   ```bash
   $ make test
   ```

   `accelerate` relies on `ruff` to format its source code
   consistently. After you make changes, apply automatic style corrections and code verifications
   that can't be automated in one go with:

   This target is also optimized to only work with files modified by the PR you're working on.

   If you prefer to run the checks one after the other, the following command applies the
   style corrections:

   ```bash
   $ make style
   ```

   `accelerate` also uses a few custom scripts to check for coding mistakes. Quality
   control runs in CI, however you can also run the same checks with:

   ```bash
   $ make quality
   ```

   You can also set up [`pre-commit`](https://pre-commit.com/) to run these checks
   automatically as Git commit hooks.

   ```bash
   $ pip install pre-commit
   $ pre-commit install
   ```

   Once you're happy with your changes, add changed files using `git add` and
   make a commit with `git commit` to record your changes locally:

   ```bash
   $ git add modified_file.py
   $ git commit
   ```

   Please write [good commit messages](https://chris.beams.io/posts/git-commit/).

   It is a good idea to sync your copy of the code with the original
   repository regularly. This way you can quickly account for changes:

   ```bash
   $ git fetch upstream
   $ git rebase upstream/main
   ```

   Push the changes to your account using:

   ```bash
   $ git push -u origin a-descriptive-name-for-my-changes
   ```

6. Once you are satisfied (**and the checklist below is happy too**), go to the
   webpage of your fork on GitHub. Click on 'Pull request' to send your changes
   to the project maintainers for review.

7. It's ok if maintainers ask you for changes. It happens to core contributors
   too! So everyone can see the changes in the Pull request, work in your local
   branch and push the changes to your fork. They will automatically appear in
   the pull request.


### Checklist

1. The title of your pull request should be a summary of its contribution;
2. If your pull request addresses an issue, please mention the issue number in
   the pull request description to make sure they are linked (and people
   consulting the issue know you are working on it);
3. To indicate a work in progress please prefix the title with `[WIP]`, or mark
   the PR as a draft PR. These are useful to avoid duplicated work, and to differentiate
   it from PRs ready to be merged;
4. Make sure existing tests pass;
5. Add high-coverage tests. No quality testing = no merge.

See an example of a good PR here: https://github.com/huggingface/accelerate/pull/255

### Tests

An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
the [tests folder](https://github.com/huggingface/accelerate/tree/main/tests).

We use `pytest` in order to run the tests. From the root of the
repository, here's how to run tests with `pytest` for the library:

```bash
$ python -m pytest -sv ./tests
```

In fact, that's how `make test` is implemented (sans the `pip install` line)!

You can specify a smaller set of tests in order to test only the feature
you're working on.


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: Makefile
================================================
# Targets declared phony so that make always runs their recipes instead of
# treating a same-named file or directory as an up-to-date result.
# NOTE(review): `docs` and `utils` have no matching target in this Makefile, and
# most targets defined below are not listed here — confirm whether that is intended.
.PHONY: quality style test docs utils

# Paths handed to ruff by the `quality` and `style` targets.
check_dirs := .

# Check that source code meets quality standards

# Runs the repository consistency scripts located under utils/.
extra_quality_checks:
	python utils/check_copies.py
	python utils/check_dummies.py
	python utils/check_repo.py

# This target runs lint and format checks on all files under $(check_dirs);
# `ruff format --check` only reports formatting differences.
quality:
	ruff check $(check_dirs)
	ruff format --check $(check_dirs)

# Format source code automatically and check if there are any problems left that need manual fixing
style:
	ruff check $(check_dirs) --fix
	ruff format $(check_dirs)
	
# Run tests for the library
#
# Every pytest invocation below appends `--report-log "$(PYTORCH_VERSION)_<name>.log"`
# only when IS_GITHUB_CI is set to a non-empty value; otherwise nothing is appended.
#
# test_core runs everything under ./tests except the files and directories that
# are ignored here and covered by the dedicated targets further down.
test_core:
	python -m pytest -s -v ./tests/ \
	--ignore=./tests/test_big_modeling.py \
	--ignore=./tests/test_modeling_utils.py \
	--ignore=./tests/test_examples.py \
	--ignore=./tests/test_cli.py \
	--ignore=./tests/deepspeed \
	--ignore=./tests/fsdp \
	--ignore=./tests/tp \
	$(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_core.log",)

# Command-line interface tests only.
test_cli:
	python -m pytest -s -v ./tests/test_cli.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_cli.log",)

# Big-model tests: test_big_modeling.py and test_modeling_utils.py.
test_big_modeling:
	python -m pytest -s -v ./tests/test_big_modeling.py ./tests/test_modeling_utils.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_big_modeling.log",)

# Tests under ./tests/deepspeed.
test_deepspeed:
	python -m pytest -s -v ./tests/deepspeed $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_deepspeed.log",)

# Tests under ./tests/fsdp.
test_fsdp:
	python -m pytest -s -v ./tests/fsdp $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_fsdp.log",)

# Tests under ./tests/tp.
test_tp:
	python -m pytest -s -v ./tests/tp $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_tp.log",)

# Since the new version of pytest will *change* how things are collected, we need `deepspeed` to
# run after test_core and test_cli
# Each suite is run through its own sub-make, sequentially, in the order listed.
test:
	$(MAKE) test_core
	$(MAKE) test_cli
	$(MAKE) test_big_modeling
	$(MAKE) test_deepspeed
	$(MAKE) test_fsdp
	$(MAKE) test_tp

# Runs all of ./tests/test_examples.py (not part of `test`).
test_examples:
	python -m pytest -s -v ./tests/test_examples.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_examples.log",)

# Broken down example tests for the CI runners
# test_integrations runs the fsdp, tp and deepspeed directories in one pytest session.
test_integrations:
	python -m pytest -s -v ./tests/fsdp ./tests/tp ./tests/deepspeed $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_integrations.log",)

# Only the ExampleDifferenceTests class from test_examples.py.
test_example_differences:
	python -m pytest -s -v ./tests/test_examples.py::ExampleDifferenceTests $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_example_diff.log",)

# FeatureExamplesTests whose names match "by_epoch".
test_checkpoint_epoch:
	python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "by_epoch" $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_checkpoint_epoch.log",)

# FeatureExamplesTests whose names match "by_step".
test_checkpoint_step:
	python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "by_step" $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_checkpoint_step.log",)

# Same as test but used to install only the base dependencies
# NOTE(review): the recipe only runs `test_core`; it neither installs anything nor
# runs the other suites that `test` runs — confirm the comment above is still accurate.
test_prod:
	$(MAKE) test_core

# Remaining FeatureExamplesTests: everything not matched by "by_step" or "by_epoch".
test_rest:
	python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "not by_step and not by_epoch" $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_rest.log",)

# For developers to prepare a release
# Removes previous build output, then builds a wheel and a source distribution.
prepare_release:
	rm -rf dist build
	python setup.py bdist_wheel sdist

# Make sure this is run in a fresh venv of some form
# Reinstalls accelerate from the test index, with pypi.org as an extra index.
# Pass `version=X.Y.Z` to pin the release; with no version the requirement is unpinned.
install_test_release:
	pip uninstall accelerate -y
	pip install -i https://testpypi.python.org/pypi --extra-index-url https://pypi.org/simple accelerate$(if $(version),==$(version),)

# Run as `make target=testpypi upload_release`
# Aborts with an error unless `target` is exactly `testpypi` or `pypi`, then
# uploads everything in dist/ to that repository with twine.
upload_release:
	@if [ "$(target)" != "testpypi" ] && [ "$(target)" != "pypi" ]; then \
		echo "Error: target must be either 'testpypi' or 'pypi'"; \
		exit 1; \
	fi
	twine upload dist/* -r $(target)

================================================
FILE: README.md
================================================
<!---
Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<p align="center">
    <br>
    <img src="https://raw.githubusercontent.com/huggingface/accelerate/main/docs/source/imgs/accelerate_logo.png" width="400"/>
    <br>
</p>

<p align="center">
    <!-- Uncomment when CircleCI is set up
    <a href="https://circleci.com/gh/huggingface/accelerate"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master"></a>
    -->
    <a href="https://github.com/huggingface/accelerate/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/huggingface/accelerate.svg?color=blue"></a>
    <a href="https://huggingface.co/docs/accelerate/index.html"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/accelerate/index.html.svg?down_color=red&down_message=offline&up_message=online"></a>
    <a href="https://github.com/huggingface/accelerate/releases"><img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/accelerate.svg"></a>
    <a href="https://github.com/huggingface/accelerate/blob/main/CODE_OF_CONDUCT.md"><img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg"></a>
</p>

<h3 align="center">
<p>Run your *raw* PyTorch training script on any kind of device</p>
</h3>

<h3 align="center">
    <a href="https://hf.co/course"><img src="https://raw.githubusercontent.com/huggingface/accelerate/main/docs/source/imgs/course_banner.png"></a>
</h3>

## Easy to integrate

🤗 Accelerate was created for PyTorch users who like to write the training loop of PyTorch models but are reluctant to write and maintain the boilerplate code needed to use multi-GPUs/TPU/fp16.

🤗 Accelerate abstracts exactly and only the boilerplate code related to multi-GPUs/TPU/fp16 and leaves the rest of your code unchanged.

Here is an example:

```diff
  import torch
  import torch.nn.functional as F
  from datasets import load_dataset
+ from accelerate import Accelerator

+ accelerator = Accelerator()
- device = 'cpu'
+ device = accelerator.device

  model = torch.nn.Transformer().to(device)
  optimizer = torch.optim.Adam(model.parameters())

  dataset = load_dataset('my_dataset')
  data = torch.utils.data.DataLoader(dataset, shuffle=True)

+ model, optimizer, data = accelerator.prepare(model, optimizer, data)

  model.train()
  for epoch in range(10):
      for source, targets in data:
          source = source.to(device)
          targets = targets.to(device)

          optimizer.zero_grad()

          output = model(source)
          loss = F.cross_entropy(output, targets)

-         loss.backward()
+         accelerator.backward(loss)

          optimizer.step()
```

As you can see in this example, by adding 5 lines to any standard PyTorch training script you can now run on any kind of single or distributed node setting (single CPU, single GPU, multi-GPUs and TPUs) as well as with or without mixed precision (fp8, fp16, bf16).

In particular, the same code can then be run without modification on your local machine for debugging or your training environment.

🤗 Accelerate even handles the device placement for you (which requires a few more changes to your code, but is safer in general), so you can even simplify your training loop further:

```diff
  import torch
  import torch.nn.functional as F
  from datasets import load_dataset
+ from accelerate import Accelerator

- device = 'cpu'
+ accelerator = Accelerator()

- model = torch.nn.Transformer().to(device)
+ model = torch.nn.Transformer()
  optimizer = torch.optim.Adam(model.parameters())

  dataset = load_dataset('my_dataset')
  data = torch.utils.data.DataLoader(dataset, shuffle=True)

+ model, optimizer, data = accelerator.prepare(model, optimizer, data)

  model.train()
  for epoch in range(10):
      for source, targets in data:
-         source = source.to(device)
-         targets = targets.to(device)

          optimizer.zero_grad()

          output = model(source)
          loss = F.cross_entropy(output, targets)

-         loss.backward()
+         accelerator.backward(loss)

          optimizer.step()
```

Want to learn more? Check out the [documentation](https://huggingface.co/docs/accelerate) or have a look at our [examples](https://github.com/huggingface/accelerate/tree/main/examples).

## Launching script

🤗 Accelerate also provides an optional CLI tool that allows you to quickly configure and test your training environment before launching the scripts. No need to remember how to use `torch.distributed.run` or to write a specific launcher for TPU training!
On your machine(s) just run:

```bash
accelerate config
```

and answer the questions asked. This will generate a config file that will be used automatically to properly set the default options when doing

```bash
accelerate launch my_script.py --args_to_my_script
``` 

For instance, here is how you would run the GLUE example on the MRPC task (from the root of the repo):

```bash
accelerate launch examples/nlp_example.py
```

This CLI tool is **optional**, and you can still use `python my_script.py` or `torchrun my_script.py` at your convenience.

You can also pass the arguments you would give to `torchrun` directly to `accelerate launch` if you do not wish to run `accelerate config`.

For example, here is how to launch on two GPUs:

```bash
accelerate launch --multi_gpu --num_processes 2 examples/nlp_example.py
```

To learn more, check the CLI documentation available [here](https://huggingface.co/docs/accelerate/package_reference/cli).

Or view the configuration zoo [here](https://github.com/huggingface/accelerate/blob/main/examples/config_yaml_templates/)

## Launching multi-CPU run using MPI

🤗 Here is another way to launch a multi-CPU run using MPI. You can learn how to install Open MPI on [this page](https://www.open-mpi.org/faq/?category=building#easy-build). You can use Intel MPI or MVAPICH as well.
Once you have MPI set up on your cluster, just run:
```bash
accelerate config
```
Answer the questions that are asked, selecting to run using multi-CPU, and answer "yes" when asked if you want accelerate to launch mpirun.
Then, use `accelerate launch` with your script like:
```bash
accelerate launch examples/nlp_example.py
```
Alternatively, you can use mpirun directly, without using the CLI like:
```bash
mpirun -np 2 python examples/nlp_example.py
```

## Launching training using DeepSpeed

🤗 Accelerate supports training on single/multiple GPUs using DeepSpeed. To use it, you don't need to change anything in your training code; you can set everything using just `accelerate config`. However, if you desire to tweak your DeepSpeed related args from your Python script, we provide you the `DeepSpeedPlugin`.

```python
from accelerate import Accelerator, DeepSpeedPlugin

# deepspeed needs to know your gradient accumulation steps beforehand, so don't forget to pass it
# Remember you still need to do gradient accumulation by yourself, just like you would have done without deepspeed
deepspeed_plugin = DeepSpeedPlugin(zero_stage=2, gradient_accumulation_steps=2)
accelerator = Accelerator(mixed_precision='fp16', deepspeed_plugin=deepspeed_plugin)

# How to save your 🤗 Transformer?
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(save_dir, save_function=accelerator.save, state_dict=accelerator.get_state_dict(model))
```

Note: DeepSpeed support is experimental for now. If you run into any problems, please open an issue.

## Launching your training from a notebook

🤗 Accelerate also provides a `notebook_launcher` function you can use in a notebook to launch a distributed training. This is especially useful for Colab or Kaggle notebooks with a TPU backend. Just define your training loop in a `training_function` then in your last cell, add:

```python
from accelerate import notebook_launcher

notebook_launcher(training_function)
```

An example can be found in [this notebook](https://github.com/huggingface/notebooks/blob/main/examples/accelerate_examples/simple_nlp_example.ipynb). [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/accelerate_examples/simple_nlp_example.ipynb)

## Why should I use 🤗 Accelerate?

You should use 🤗 Accelerate when you want to easily run your training scripts in a distributed environment without having to renounce full control over your training loop. This is not a high-level framework above PyTorch, just a thin wrapper so you don't have to learn a new library. In fact, the whole API of 🤗 Accelerate is in one class, the `Accelerator` object.

## Why shouldn't I use 🤗 Accelerate?

You shouldn't use 🤗 Accelerate if you don't want to write a training loop yourself. There are plenty of high-level libraries above PyTorch that will offer you that, 🤗 Accelerate is not one of them.

## Frameworks using 🤗 Accelerate

If you like the simplicity of 🤗 Accelerate but would prefer a higher-level abstraction around its capabilities, some frameworks and libraries that are built on top of 🤗 Accelerate are listed below:

* [Amphion](https://github.com/open-mmlab/Amphion) is a toolkit for Audio, Music, and Speech Generation. Its purpose is to support reproducible research and help junior researchers and engineers get started in the field of audio, music, and speech generation research and development.
* [Animus](https://github.com/Scitator/animus) is a minimalistic framework to run machine learning experiments. Animus highlights common "breakpoints" in ML experiments and provides a unified interface for them within [IExperiment](https://github.com/Scitator/animus/blob/main/animus/core.py#L76).
* [Catalyst](https://github.com/catalyst-team/catalyst#getting-started) is a PyTorch framework for Deep Learning Research and Development. It focuses on reproducibility, rapid experimentation, and codebase reuse so you can create something new rather than write yet another train loop. Catalyst provides a [Runner](https://catalyst-team.github.io/catalyst/api/core.html#runner) to connect all parts of the experiment: hardware backend, data transformations, model training, and inference logic.
* [fastai](https://github.com/fastai/fastai#installing) is a PyTorch framework for Deep Learning that simplifies training fast and accurate neural nets using modern best practices. fastai provides a [Learner](https://docs.fast.ai/learner.html#Learner) to handle the training, fine-tuning, and inference of deep learning algorithms.
* [Finetuner](https://github.com/jina-ai/finetuner) is a service that enables models to create higher-quality embeddings for semantic search, visual similarity search, cross-modal text<->image search, recommendation systems, clustering, duplication detection, anomaly detection, or other uses.
* [InvokeAI](https://github.com/invoke-ai/InvokeAI) is a creative engine for Stable Diffusion models, offering industry-leading WebUI, terminal usage support, and serves as the foundation for many commercial products.
* [Kornia](https://kornia.readthedocs.io/en/latest/get-started/introduction.html) is a differentiable library that allows classical computer vision to be integrated into deep learning models. Kornia provides a [Trainer](https://kornia.readthedocs.io/en/latest/x.html#kornia.x.Trainer) with the specific purpose to train and fine-tune the supported deep learning algorithms within the library.
* [Open Assistant](https://projects.laion.ai/Open-Assistant/) is a chat-based assistant that understands tasks, can interact with third-party systems, and retrieve information dynamically to do so.
* [pytorch-accelerated](https://github.com/Chris-hughes10/pytorch-accelerated) is a lightweight training library, with a streamlined feature set centered around a general-purpose [Trainer](https://pytorch-accelerated.readthedocs.io/en/latest/trainer.html), that places a huge emphasis on simplicity and transparency; enabling users to understand exactly what is going on under the hood, but without having to write and maintain the boilerplate themselves!
* [Stable Diffusion web UI](https://github.com/AUTOMATIC1111/stable-diffusion-webui) is an open-source browser-based easy-to-use interface based on the Gradio library for Stable Diffusion.
* [torchkeras](https://github.com/lyhue1991/torchkeras) is a simple tool for training PyTorch models in a Keras style; a dynamic and beautiful plot is provided in notebooks to monitor your loss or metrics.
* [transformers](https://github.com/huggingface/transformers) is a tool for helping train state-of-the-art machine learning models in PyTorch, Tensorflow, and JAX. (Accelerate is the backend for the PyTorch side).


## Installation

This repository is tested on Python 3.8+ and PyTorch 1.10.0+

You should install 🤗 Accelerate in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

First, create a virtual environment with the version of Python you're going to use and activate it.

Then, you will need to install PyTorch: refer to the [official installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform. Then 🤗 Accelerate can be installed using pip as follows:

```bash
pip install accelerate
```

## Supported integrations

- CPU only
- multi-CPU on one node (machine)
- multi-CPU on several nodes (machines)
- single GPU
- multi-GPU on one node (machine)
- multi-GPU on several nodes (machines)
- TPU
- FP16/BFloat16 mixed precision
- FP8 mixed precision with [Transformer Engine](https://github.com/NVIDIA/TransformerEngine) or [MS-AMP](https://github.com/Azure/MS-AMP/)
- DeepSpeed support (Experimental)
- PyTorch Fully Sharded Data Parallel (FSDP) support (Experimental)
- Megatron-LM support (Experimental)

## Citing 🤗 Accelerate

If you use 🤗 Accelerate in your publication, please cite it by using the following BibTeX entry.

```bibtex
@Misc{accelerate,
  title =        {Accelerate: Training and inference at scale made simple, efficient and adaptable.},
  author =       {Sylvain Gugger and Lysandre Debut and Thomas Wolf and Philipp Schmid and Zachary Mueller and Sourab Mangrulkar and Marc Sun and Benjamin Bossan},
  howpublished = {\url{https://github.com/huggingface/accelerate}},
  year =         {2022}
}
```


================================================
FILE: benchmarks/README.md
================================================
# Benchmarks

The folders below contain suites to test various functionalities in Accelerate.

See their relevant README.md's for more information.


================================================
FILE: benchmarks/big_model_inference/README.md
================================================
# Big model inference benchmarks

Running inference with Accelerate on big models.

## Setup

These benchmarks use the `transformers` library:

```bash
pip install transformers
```

To reproduce or test a new setup, run

```bash
python big_model_inference.py model_name
```

This script supports `gpt-j-6b`, `gpt-neox`, `opt` (30B version) and `T0pp` out of the box, but you can specify any valid checkpoint for `model_name`.

To force a different `torch_dtype` than the one in the config: `--torch_dtype xxx`.

If you get an error linked to disk offload, you need to add the option `--disk_offload`.

## Results

On a setup with two Titan RTXs (24GB of RAM) and 32GB of RAM, we get the following benchmarks (T0pp does not run in float16, which is why it's not included).

| Model | Model load time | Generation time | dtype | GPU 0 use | GPU 1 use | CPU use | Disk offload |
|:-----:|:---------------:|:---------------:|:-----:|:---------:|:---------:|:-------:|:------------:|
| GPT-J-6B | 8.7s | 0.05s per token | float16 | 11.7GB | 0GB | 0GB | no |
| GPT-J-6B | 12.4s | 0.06s per token | float32 | 21.9GB | 1.5GB | 0GB | no |
| GPT-Neo-X-20B | 30.9s | 0.08s per token | float16 | 21.5GB | 18GB | 0GB | no |
| GPT-Neo-X-20B | 78.2s | 10.72s per token | float32 | 20.3GB | 22.7 GB | 24.4GB | yes |
| T0pp (11B) | 29.4s | 0.05s per token | float32 | 21.1GB | 21.3GB | 0GB | no |
| OPT-30B | 34.5s | 2.37s per token | float16 | 20.7GB | 22.3GB | 14.1GB | no |
| OPT-30B | 112.3s | 33.9s per token | float32 | 20.2GB | 21.2GB | 23.5GB | yes |

Note on the results:
- using two GPUs instead of one does not slow down generation
- using CPU offload slows down a bit (see OPT-30b)
- using disk offload slows down a lot (need to implement prefetching)

You will also note that Accelerate does not use any more GPU and CPU RAM than necessary:
- peak GPU memory is exactly the size of the model put on a given GPU
- peak CPU memory is either the size of the biggest checkpoint shard or the part of the model offloaded on CPU, whichever is bigger.


================================================
FILE: benchmarks/big_model_inference/big_model_inference.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import time

import torch
import transformers
from measures_util import end_measure, log_measures, start_measure
from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer

from accelerate.utils import compute_module_sizes


# Shortcut names accepted for `model_name` on the command line, mapped to the defaults `parse_args` fills in:
# - "is_causal": whether `main` loads the checkpoint with `AutoModelForCausalLM` (otherwise `AutoModelForSeq2SeqLM`)
# - "model": the checkpoint that is actually loaded
# - "tokenizer" (optional): the tokenizer checkpoint; `parse_args` falls back to the model checkpoint
# - "model_revision" (optional): the checkpoint revision; `parse_args` falls back to "main"
DEFAULT_MODELS = {
    "gpt-j-6b": {"is_causal": True, "model": "sgugger/sharded-gpt-j-6B", "tokenizer": "EleutherAI/gpt-j-6B"},
    "gpt-neox": {"is_causal": True, "model": "EleutherAI/gpt-neox-20b"},
    "opt": {"is_causal": True, "model": "facebook/opt-30b"},
    "T0pp": {"is_causal": False, "model": "bigscience/T0pp", "model_revision": "sharded"},
}

# Prompts that `main` generates from, and times, one after the other.
PROMPTS = [
    "Hello, my name is",
    "Are unicorns real? Unicorns are",
    "For the first time in several years,",
    "My name is Julien and I am",
    "The goal of life is",
    "Whenever I'm sad, I like to",
]


def _str_to_bool(value):
    """
    Convert a command-line string such as `"true"` or `"0"` into a real boolean.

    Args:
        value (`str`): The raw string received from the command line.

    Returns:
        `bool`: The parsed value. The empty string maps to `False`, as `bool("")` does.

    Raises:
        `argparse.ArgumentTypeError`: If `value` is not a recognized boolean spelling.
    """
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    if normalized in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value (e.g. `True` or `False`), got {value!r}.")


def parse_args():
    """
    Parse the command-line arguments and fill in the defaults of the models listed in `DEFAULT_MODELS`.

    When `model_name` is one of the keys of `DEFAULT_MODELS`, it is replaced by the real checkpoint name and any of
    `--tokenizer_name`, `--is_causal` and `--model_revision` that was not passed is taken from that entry.

    Returns:
        `argparse.Namespace`: The parsed arguments, with `tokenizer_name`, `is_causal` and `model_revision` set.

    Raises:
        `ValueError`: If `--is_causal` was not passed and cannot be inferred from `DEFAULT_MODELS`.
    """
    parser = argparse.ArgumentParser(description="Run and time generations on a big model using Accelerate.")
    parser.add_argument("model_name", type=str, default=None, help="The name of the model to try.")
    parser.add_argument(
        "--tokenizer_name", type=str, default=None, help="The name of the tokenizer (if different from the model)."
    )
    # `type=bool` would call `bool()` on the raw string, turning every non-empty value (even "False") into `True`.
    parser.add_argument("--is_causal", type=_str_to_bool, default=None, help="Whether or not the model is causal.")
    parser.add_argument(
        "--model_revision", type=str, default=None, help="The revision to use for the model checkpoint."
    )
    parser.add_argument("--torch_dtype", type=str, default=None, help="The dtype for the model.")
    parser.add_argument("--disk_offload", action="store_true")

    args = parser.parse_args()

    # Sanitize args
    if args.model_name in DEFAULT_MODELS:
        defaults = DEFAULT_MODELS[args.model_name]
        args.model_name = defaults["model"]
        if args.tokenizer_name is None:
            args.tokenizer_name = defaults.get("tokenizer", args.model_name)
        if args.is_causal is None:
            args.is_causal = defaults["is_causal"]
        if args.model_revision is None:
            args.model_revision = defaults.get("model_revision", "main")

    if args.is_causal is None:
        raise ValueError("Could not infer the default for `--is_causal`, pass either True or False for it.")
    if args.tokenizer_name is None:
        args.tokenizer_name = args.model_name
    if args.model_revision is None:
        args.model_revision = "main"

    return args


def main():
    """
    Load the requested model with `device_map="auto"`, generate from every prompt in `PROMPTS` and print the
    time and memory measurements for both the loading and the generation phases.
    """
    transformers.utils.logging.set_verbosity_error()
    args = parse_args()

    # An explicit `--torch_dtype` wins; otherwise read the dtype stored in the model config.
    if args.torch_dtype is not None:
        dtype = getattr(torch, args.torch_dtype)
    else:
        dtype = getattr(AutoConfig.from_pretrained(args.model_name), "torch_dtype", torch.float32)

    if args.is_causal:
        auto_class = AutoModelForCausalLM
    else:
        auto_class = AutoModelForSeq2SeqLM

    load_kwargs = {"torch_dtype": dtype, "revision": args.model_revision}
    if args.disk_offload:
        load_kwargs.update(offload_folder="tmp_offload", offload_state_dict=True)

    # Phase 1: model loading
    loading_start = start_measure()
    model = auto_class.from_pretrained(args.model_name, device_map="auto", **load_kwargs)
    log_measures(end_measure(loading_start), "Model loading")

    # Sum the size of the modules placed on each device of the device map
    module_sizes = compute_module_sizes(model)
    size_per_device = dict.fromkeys(model.hf_device_map.values(), 0)
    for module_name, device in model.hf_device_map.items():
        size_per_device[device] += module_sizes[module_name]
    summary = "\n".join(f"- {device}: {size // 2**20}MiB" for device, size in size_per_device.items())
    print(f"\nTheoretical use:\n{summary}")

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)

    # Phase 2: generation
    generation_start = start_measure()
    durations = []
    new_token_counts = []
    decoded_texts = []
    for prompt in PROMPTS:
        encoded = tokenizer(prompt, return_tensors="pt").to(0)
        prompt_ids = encoded["input_ids"][0].tolist()

        tic = time.time()
        generated = model.generate(encoded["input_ids"])
        toc = time.time()

        generated_ids = generated[0].tolist()
        # Only count the new tokens when the output starts with the prompt
        if generated_ids[: len(prompt_ids)] == prompt_ids:
            new_tokens = len(generated_ids) - len(prompt_ids)
        else:
            new_tokens = len(generated_ids)
        elapsed = toc - tic

        decoded = tokenizer.decode(generated_ids, skip_special_tokens=True)
        decoded_texts.append(decoded)
        durations.append(elapsed)
        new_token_counts.append(new_tokens)
        print(f"Prompt: {prompt}\nGeneration {decoded}\nIn {elapsed:.2f}s for {new_tokens} tokens\n")

    log_measures(end_measure(generation_start), "Model generation")

    per_token = [duration / count for duration, count in zip(durations, new_token_counts)]
    overall_average = sum(per_token) / len(durations)
    print(f"Average time of generation per token: {overall_average:.2f}s")
    print(f"First generation (avg time per token): {per_token[0]:.2f}s")
    later_average = sum(per_token[1:]) / (len(per_token) - 1)
    print(f"Average time of generation per token (excluding the first): {later_average:.2f}s")


# Only run the benchmark when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: benchmarks/big_model_inference/measures_util.py
================================================
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import threading
import time

import psutil
import torch

from accelerate.test_utils.testing import get_backend


# Device type string returned first by `get_backend()`; the two other returned values are not used here.
torch_device_type, _, _ = get_backend()
# The `torch.<device type>` attribute of that name, falling back to `torch.cuda` when `torch` has no such attribute.
torch_accelerator_module = getattr(torch, torch_device_type, torch.cuda)


class PeakCPUMemory:
    """
    Records the highest resident set size (RSS) reached by the current process between `start()` and `stop()`.

    The RSS is polled in a tight loop from a background daemon thread.
    """

    def __init__(self):
        self.peak_monitoring = False
        self.process = psutil.Process()

    def peak_monitor(self):
        """Thread target: keep updating `cpu_memory_peak` until `peak_monitoring` is switched off."""
        self.cpu_memory_peak = -1

        keep_sampling = True
        while keep_sampling:
            current_rss = self.process.memory_info().rss
            if current_rss > self.cpu_memory_peak:
                self.cpu_memory_peak = current_rss

            # can't sleep or will not catch the peak right (this comment is here on purpose)
            # The flag is read after sampling, so at least one sample is always taken.
            keep_sampling = self.peak_monitoring

    def start(self):
        """Spawn the daemon thread that polls the memory usage."""
        self.peak_monitoring = True
        self.thread = threading.Thread(target=self.peak_monitor, daemon=True)
        self.thread.start()

    def stop(self):
        """Stop polling, wait for the thread to finish and return the recorded peak RSS."""
        self.peak_monitoring = False
        self.thread.join()
        return self.cpu_memory_peak


# Module-level tracker: started by `start_measure` and stopped by `end_measure`.
cpu_peak_tracker = PeakCPUMemory()


def start_measure():
    """
    Take the baseline measurements (time, CPU RSS, per-device allocated memory) and start tracking peaks.

    Returns:
        `dict`: The wall-clock time under `"time"`, the process RSS in bytes under `"cpu"` and the allocated
        accelerator memory in bytes under `str(device_index)` for each device.
    """
    # Time
    measures = {"time": time.time()}

    gc.collect()
    torch_accelerator_module.empty_cache()

    # CPU mem
    measures["cpu"] = psutil.Process().memory_info().rss
    cpu_peak_tracker.start()

    # GPU mem
    for i in range(torch_accelerator_module.device_count()):
        measures[str(i)] = torch_accelerator_module.memory_allocated(i)
        # Reset the peak counter of every device: without an explicit device only the current one is reset,
        # while `end_measure` reads the peak of each device.
        torch_accelerator_module.reset_peak_memory_stats(i)

    return measures


def end_measure(start_measures):
    """
    Compute the differences with the baseline returned by `start_measure`.

    Args:
        start_measures (`dict`): The dictionary returned by `start_measure`.

    Returns:
        `dict`: The elapsed time in seconds under `"time"`; the other entries are memory deltas divided by 2**20:
        `"cpu"`, `"cpu-peak"` and, for each device index `i`, `str(i)` and `f"{i}-peak"`.
    """
    bytes_per_mib = 2**20

    # Time
    results = {"time": time.time() - start_measures["time"]}

    gc.collect()
    torch_accelerator_module.empty_cache()

    # CPU mem
    results["cpu"] = (psutil.Process().memory_info().rss - start_measures["cpu"]) / bytes_per_mib
    results["cpu-peak"] = (cpu_peak_tracker.stop() - start_measures["cpu"]) / bytes_per_mib

    # GPU mem
    for device_index in range(torch_accelerator_module.device_count()):
        key = str(device_index)
        allocated_delta = torch_accelerator_module.memory_allocated(device_index) - start_measures[key]
        results[key] = allocated_delta / bytes_per_mib
        peak_delta = torch_accelerator_module.max_memory_allocated(device_index) - start_measures[key]
        results[f"{key}-peak"] = peak_delta / bytes_per_mib

    return results


def log_measures(measures, description):
    """
    Print the measurements computed by `end_measure`.

    Args:
        measures (`dict`): The dictionary returned by `end_measure`.
        description (`str`): Title printed on the first line.
    """
    print(f"{description}:")
    elapsed = measures["time"]
    print(f"- Time: {elapsed:.2f}s")
    for device_index in range(torch_accelerator_module.device_count()):
        allocated = measures[str(device_index)]
        print(f"- {torch_device_type} {device_index} allocated: {allocated:.2f}MiB")
        peak = measures[f"{device_index}-peak"]
        print(f"- {torch_device_type} {device_index} peak: {peak:.2f}MiB")
    cpu_allocated = measures["cpu"]
    print(f"- CPU RAM allocated: {cpu_allocated:.2f}MiB")
    cpu_peak = measures["cpu-peak"]
    print(f"- CPU RAM peak: {cpu_peak:.2f}MiB")


================================================
FILE: benchmarks/fp8/ms_amp/Dockerfile
================================================
# Start from the `msamp` image hosted on GitHub Container Registry (presumably ships MS-AMP preinstalled; confirm).
FROM ghcr.io/azure/msamp

# Extra Python packages used by the benchmark scripts, plus a checkout of Accelerate itself.
RUN pip install transformers evaluate datasets
RUN git clone https://github.com/huggingface/accelerate

# Install the checkout in editable mode.
# NOTE(review): the trailing `cd benchmarks/fp8` only affects the shell of this RUN instruction; it does not change
# the working directory of later instructions or of the running container. Use `WORKDIR` if that was the intent.
RUN cd accelerate && \
    pip install -e . && \
    cd benchmarks/fp8

# Default command: open a shell.
CMD ["bash"]


================================================
FILE: benchmarks/fp8/ms_amp/ddp.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `MS-AMP`.

This particular script verifies this for DDP training.
"""

import evaluate
import msamp
import torch
from fp8_utils import evaluate_model, get_training_utilities
from torch.nn.parallel import DistributedDataParallel as DDP

from accelerate import Accelerator
from accelerate.state import AcceleratorState
from accelerate.utils import FP8RecipeKwargs, get_grad_scaler, set_seed


# Checkpoint name handed to `get_training_utilities` by both training functions.
MODEL_NAME = "bert-base-cased"
# GLUE/MRPC metric loaded with `evaluate`, passed to `evaluate_model`.
METRIC = evaluate.load("glue", "mrpc")


def train_baseline(opt_level="O2"):
    """
    Train with raw MS-AMP and a manually built `DistributedDataParallel` wrapper (no Accelerate FP8 integration).

    Args:
        opt_level (`str`, defaults to `"O2"`):
            The optimization level forwarded to `msamp.initialize`.

    Returns:
        `tuple`: The results of `evaluate_model` before training and after training, in that order.
    """
    set_seed(42)
    grad_scaler = get_grad_scaler()
    model, optimizer, train_loader, eval_loader, scheduler = get_training_utilities(MODEL_NAME)
    accelerator = Accelerator()

    model, optimizer = msamp.initialize(model, optimizer, opt_level=opt_level)
    model.to(accelerator.device)

    # Convert the model to DDP, pinned to the local index of this process
    local_index = accelerator.local_process_index
    model = DDP(model, device_ids=[local_index], output_device=local_index)

    metrics_before = evaluate_model(model, eval_loader, METRIC, accelerator=accelerator)
    model.train()

    for batch in train_loader:
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            loss = model(**batch).loss
        grad_scaler.scale(loss).backward()
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()

    metrics_after = evaluate_model(model, eval_loader, METRIC, accelerator=accelerator)

    # Training must have improved both metrics
    assert metrics_after["accuracy"] > metrics_before["accuracy"], (
        f"Accuracy should be higher for the trained model: {metrics_after['accuracy']} > {metrics_before['accuracy']}"
    )
    assert metrics_after["f1"] > metrics_before["f1"], (
        f"F1 score should be higher for the trained model: {metrics_after['f1']} > {metrics_before['f1']}"
    )

    return metrics_before, metrics_after


def train_integration(opt_level="O2"):
    """
    Train through Accelerate with `mixed_precision="fp8"` and the MS-AMP backend.

    Args:
        opt_level (`str`, defaults to `"O2"`):
            The optimization level passed to `FP8RecipeKwargs`.

    Returns:
        `tuple`: The results of `evaluate_model` before training and after training, in that order.
    """
    fp8_recipe = FP8RecipeKwargs(backend="msamp", opt_level=opt_level)
    # Start from a clean state so that the new `Accelerator` is not affected by a previous one
    AcceleratorState()._reset_state(True)
    accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[fp8_recipe])
    set_seed(42)
    model, optimizer, train_loader, eval_loader, scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    model, optimizer = accelerator.prepare(model, optimizer)
    metrics_before = evaluate_model(model, eval_loader, METRIC, accelerator=accelerator)
    model.train()

    for batch in train_loader:
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            loss = model(**batch).loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()

    metrics_after = evaluate_model(model, eval_loader, METRIC, accelerator=accelerator)

    # Training must have improved both metrics
    assert metrics_after["accuracy"] > metrics_before["accuracy"], (
        f"Accuracy should be higher for the trained model: {metrics_after['accuracy']} > {metrics_before['accuracy']}"
    )
    assert metrics_after["f1"] > metrics_before["f1"], (
        f"F1 score should be higher for the trained model: {metrics_after['f1']} > {metrics_before['f1']}"
    )

    return metrics_before, metrics_after


# For each MS-AMP optimization level, run the raw MS-AMP baseline and the Accelerate integration, then require
# the metrics to be exactly equal both before and after training.
if __name__ == "__main__":
    for opt_level in ["O1", "O2"]:
        baseline_not_trained, baseline_trained = train_baseline(opt_level)
        accelerator_not_trained, accelerator_trained = train_integration(opt_level)
        assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
            f"Accuracy not the same for untrained baseline and accelerator using opt_level={opt_level}: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
        )
        assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
            f"F1 not the same for untrained baseline and accelerator using opt_level={opt_level}: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
        )
        assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
            f"Accuracy not the same for trained baseline and accelerator using opt_level={opt_level}: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
        )
        assert baseline_trained["f1"] == accelerator_trained["f1"], (
            f"F1 not the same for trained baseline and accelerator using opt_level={opt_level}: {baseline_trained['f1']} == {accelerator_trained['f1']}"
        )


================================================
FILE: benchmarks/fp8/ms_amp/distrib_deepspeed.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `MS-AMP`.

This particular script verifies this for DeepSpeed training.

NOTE: MS-AMP does *not* support ZeRO-3.
"""

# import msamp.deepspeed as msamp_deepspeed
import evaluate
import torch
from fp8_utils import evaluate_model, get_training_utilities
from msamp import deepspeed as msamp_deepspeed

from accelerate import Accelerator, DeepSpeedPlugin
from accelerate.state import AcceleratorState
from accelerate.utils import set_seed


# Checkpoint name handed to `get_training_utilities` by both training functions.
MODEL_NAME = "bert-base-cased"
# GLUE/MRPC metric loaded with `evaluate`, passed to `evaluate_model`.
METRIC = evaluate.load("glue", "mrpc")


def train_baseline(zero_stage: int = 1, opt_level: str = "O1"):
    """
    Train for two epochs with MS-AMP's DeepSpeed integration directly, without going through `accelerator.prepare`.

    Args:
        zero_stage (`int`, defaults to 1):
            The ZeRO stage written in the DeepSpeed config.
        opt_level (`str`, defaults to `"O1"`):
            The MS-AMP optimization level written in the DeepSpeed config.

    Returns:
        `tuple`: The results of `evaluate_model` before training and after training, in that order.
    """
    set_seed(42)
    accelerator = Accelerator()
    model, optimizer, train_loader, eval_loader, scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    import numpy as np

    no_offload = {"device": "none", "nvme_path": None}
    ds_config = {
        "train_batch_size": 32,
        "train_micro_batch_size_per_gpu": 16,
        "gradient_accumulation_steps": 1,
        "zero_optimization": {
            "stage": zero_stage,
            "offload_optimizer": dict(no_offload),
            "offload_param": dict(no_offload),
        },
        "gradient_clipping": 1.0,
        "steps_per_print": np.inf,
        "bf16": {"enabled": True},
        "fp16": {"enabled": False},
        "zero_allow_untested_optimizer": True,
        "msamp": {"enabled": True, "opt_level": opt_level},
    }
    # Only the first two returned values (engine and optimizer) are needed
    model, optimizer, _, _ = msamp_deepspeed.initialize(model=model, optimizer=optimizer, config_params=ds_config)

    metrics_before = evaluate_model(model, eval_loader, METRIC, accelerator=accelerator)
    model.train()

    for _epoch in range(2):
        for batch in train_loader:
            loss = model(**batch).loss
            model.backward(loss)
            model.step()
            # The scheduler is stepped once per process after every batch
            for _rank in range(accelerator.num_processes):
                scheduler.step()

    metrics_after = evaluate_model(model, eval_loader, METRIC, accelerator=accelerator)

    # Release the engine and reset the shared state before checking the results
    model.destroy()
    torch.cuda.empty_cache()
    AcceleratorState()._reset_state(True)

    assert metrics_after["accuracy"] > metrics_before["accuracy"], (
        f"Accuracy should be higher for the trained model: {metrics_after['accuracy']} > {metrics_before['accuracy']}"
    )
    assert metrics_after["f1"] > metrics_before["f1"], (
        f"F1 score should be higher for the trained model: {metrics_after['f1']} > {metrics_before['f1']}"
    )

    return metrics_before, metrics_after


def train_integration(zero_stage: int = 1, opt_level: str = "O1"):
    """
    Train for two epochs through Accelerate, with a `DeepSpeedPlugin` that has MS-AMP enabled.

    Args:
        zero_stage (`int`, defaults to 1):
            The ZeRO stage passed to `DeepSpeedPlugin`.
        opt_level (`str`, defaults to `"O1"`):
            The MS-AMP optimization level passed to `DeepSpeedPlugin`.

    Returns:
        `tuple`: The results of `evaluate_model` before training and after training, in that order.
    """
    set_seed(42)
    plugin = DeepSpeedPlugin(zero_stage=zero_stage, enable_msamp=True, msamp_opt_level=opt_level)
    accelerator = Accelerator(mixed_precision="fp8", deepspeed_plugin=plugin)
    ds_config = accelerator.state.deepspeed_plugin.deepspeed_config
    ds_config["train_micro_batch_size_per_gpu"] = 16

    model, optimizer, train_loader, eval_loader, scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)
    metrics_before = evaluate_model(model, eval_loader, METRIC, accelerator=accelerator)
    model.train()

    for _epoch in range(2):
        for batch in train_loader:
            loss = model(**batch).loss
            accelerator.backward(loss)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    metrics_after = evaluate_model(model, eval_loader, METRIC, accelerator=accelerator)

    # Release the engine before checking the results
    model.destroy()
    torch.cuda.empty_cache()

    assert metrics_after["accuracy"] > metrics_before["accuracy"], (
        f"Accuracy should be higher for the trained model: {metrics_after['accuracy']} > {metrics_before['accuracy']}"
    )
    assert metrics_after["f1"] > metrics_before["f1"], (
        f"F1 score should be higher for the trained model: {metrics_after['f1']} > {metrics_before['f1']}"
    )

    # Only reached when both checks passed, like in the original flow
    AcceleratorState()._reset_state(True)
    return metrics_before, metrics_after


# For every ZeRO stage / MS-AMP optimization level pair, run the raw MS-AMP DeepSpeed baseline and the Accelerate
# integration, then require the metrics to be exactly equal both before and after training.
if __name__ == "__main__":
    for zero_stage in [1, 2]:
        for opt_level in ["O1", "O2", "O3"]:
            baseline_not_trained, baseline_trained = train_baseline(zero_stage, opt_level)
            accelerator_not_trained, accelerator_trained = train_integration(zero_stage, opt_level)
            assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
                f"ZERO stage {zero_stage}, opt_level={opt_level}:\nAccuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
            )
            assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
                f"ZERO stage {zero_stage}, opt_level={opt_level}:\nF1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
            )
            assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
                f"ZERO stage {zero_stage}, opt_level={opt_level}:\nAccuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
            )
            assert baseline_trained["f1"] == accelerator_trained["f1"], (
                f"ZERO stage {zero_stage}, opt_level={opt_level}:\nF1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
            )

    # Tear down the process group once every configuration has been checked.
    torch.distributed.destroy_process_group()


================================================
FILE: benchmarks/fp8/ms_amp/fp8_utils.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch


def get_dataloaders(model_name: str, batch_size: int = 16):
    """
    Build the train and validation dataloaders for GLUE MRPC, tokenized with the tokenizer of `model_name`.

    Args:
        model_name: Name handed to `AutoTokenizer.from_pretrained`.
        batch_size: Batch size of the train dataloader. The eval dataloader always uses a batch size of 16.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple. Both drop the last incomplete batch; only the train
        dataloader shuffles.
    """
    from datasets import load_dataset
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    raw_splits = load_dataset("glue", "mrpc")

    def _encode_pairs(rows):
        # max_length=None => fall back to the model max length (which is the default anyway)
        return tokenizer(rows["sentence1"], rows["sentence2"], truncation=True, max_length=None)

    def _pad_batch(rows):
        # Sequence lengths are padded to a multiple of 16, which is specific to FP8
        return tokenizer.pad(rows, padding="longest", pad_to_multiple_of=16, return_tensors="pt")

    # Tokenize every split, drop the raw columns, then rename 'label' to 'labels', which is the name the
    # models of the transformers library expect
    encoded_splits = raw_splits.map(
        _encode_pairs,
        batched=True,
        remove_columns=["idx", "sentence1", "sentence2"],
    ).rename_column("label", "labels")

    shared_kwargs = {"collate_fn": _pad_batch, "drop_last": True}
    train_dataloader = DataLoader(encoded_splits["train"], shuffle=True, batch_size=batch_size, **shared_kwargs)
    eval_dataloader = DataLoader(encoded_splits["validation"], shuffle=False, batch_size=16, **shared_kwargs)
    return train_dataloader, eval_dataloader


def get_training_utilities(model_name: str, batch_size: int = 16, accelerator=None):
    """
    Assemble everything needed to train `model_name` on the MRPC dataset.

    Args:
        model_name: Name handed to `AutoModelForSequenceClassification.from_pretrained` and to `get_dataloaders`.
        batch_size: Train batch size forwarded to `get_dataloaders`.
        accelerator: `Accelerator` used to prepare the dataloaders. A default one is created when `None`.

    Returns:
        A tuple of `(model, optimizer, train_dataloader, eval_dataloader, lr_scheduler)`. Only the two dataloaders
        have been passed through `accelerator.prepare`.
    """
    from torch.optim import AdamW
    from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup

    from accelerate import Accelerator

    accelerator = Accelerator() if accelerator is None else accelerator
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    raw_loaders = get_dataloaders(model_name, batch_size)
    optimizer = AdamW(model.parameters(), lr=0.0001)
    # The schedule length is twice the number of batches in the not-yet-prepared train dataloader
    total_steps = len(raw_loaders[0]) * 2
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=total_steps,
    )
    train_dataloader, eval_dataloader = accelerator.prepare(*raw_loaders)
    return model, optimizer, train_dataloader, eval_dataloader, lr_scheduler


def get_named_parameters(model):
    """
    Return a `{name: parameter}` dict for `model` after unwrapping it with `extract_model_from_parallel`.

    Intended to match `Accelerator.get_named_parameters`.
    """
    from accelerate.utils import extract_model_from_parallel

    unwrapped = extract_model_from_parallel(model)
    return dict(unwrapped.named_parameters())


def evaluate_model(model, dataloader, metric, accelerator=None):
    "Turns model to .eval(), runs dataloader, calculates metric, then turns eval back on"
    model.eval()
    for step, batch in enumerate(dataloader):
        with torch.no_grad():
            # W/ MS-AMP, we need to cast while evaluating
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        references = batch["labels"]
        if accelerator is not None and accelerator.num_processes > 1:
            predictions, references = accelerator.gather_for_metrics((predictions, references))
        metric.add_batch(predictions=predictions, references=references)
    return metric.compute()


================================================
FILE: benchmarks/fp8/ms_amp/non_distributed.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `MS-AMP`.

This particular script verifies this for single GPU training.
"""

import evaluate
import msamp
import torch
from fp8_utils import evaluate_model, get_training_utilities

from accelerate import Accelerator
from accelerate.state import AcceleratorState
from accelerate.utils import FP8RecipeKwargs, get_grad_scaler, set_seed


# Model identifier passed to `get_training_utilities` by both training functions
MODEL_NAME = "bert-base-cased"
# GLUE MRPC metric; the script compares its "accuracy" and "f1" results
METRIC = evaluate.load("glue", "mrpc")


def train_baseline(opt_level="O2"):
    """
    Train `MODEL_NAME` on MRPC by calling MS-AMP directly, without going through `Accelerator`'s FP8 support.

    Args:
        opt_level: Optimization level forwarded to `msamp.initialize`.

    Returns:
        A `(base_model_results, trained_model_results)` tuple with the metric results computed before and after
        the training loop.

    Raises:
        AssertionError: If accuracy or F1 after training is not strictly higher than before training.
    """
    # Same seed as `train_integration`, so both runs are seeded identically
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)

    # Hand the model and optimizer to MS-AMP for the requested opt level
    model, optimizer = msamp.initialize(model, optimizer, opt_level=opt_level)
    model.to("cuda")

    # Metrics of the untrained model, used as the reference point for the assertions below
    base_model_results = evaluate_model(model, eval_dataloader, METRIC)
    model.train()
    scaler = get_grad_scaler()

    # A single pass over the train dataloader
    for batch in train_dataloader:
        batch = batch.to("cuda")
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            outputs = model(**batch)
        loss = outputs.loss
        # NOTE(review): the loss is scaled, but `scaler.step()` / `scaler.update()` are never called and the
        # optimizer is stepped directly — confirm this is the intended MS-AMP usage
        loss = scaler.scale(loss)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC)

    # Sanity check: training must have improved both metrics
    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    return base_model_results, trained_model_results


def train_integration(opt_level="O2"):
    """
    Train `MODEL_NAME` on MRPC through `Accelerator` with `mixed_precision="fp8"` and the MS-AMP backend.

    Args:
        opt_level: Optimization level set on the `FP8RecipeKwargs` handler.

    Returns:
        A `(base_model_results, trained_model_results)` tuple with the metric results computed before and after
        the training loop.

    Raises:
        AssertionError: If accuracy or F1 after training is not strictly higher than before training.
    """
    kwargs_handlers = [FP8RecipeKwargs(backend="msamp", opt_level=opt_level)]
    # Presumably clears the shared accelerate state left by a previous run so a new `Accelerator` can be
    # configured differently — confirm against `AcceleratorState._reset_state`
    AcceleratorState()._reset_state(True)
    accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=kwargs_handlers)
    # Same seed as `train_baseline`, so both runs are seeded identically
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    # The dataloaders were already prepared inside `get_training_utilities`
    model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
    # Metrics of the untrained model, used as the reference point for the assertions below
    base_model_results = evaluate_model(model, eval_dataloader, METRIC)
    model.train()

    # A single pass over the train dataloader
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC)

    # Sanity check: training must have improved both metrics
    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    return base_model_results, trained_model_results


if __name__ == "__main__":
    # For each MS-AMP opt level, the accelerate run must reproduce the raw MS-AMP metrics exactly,
    # both before and after training
    metric_labels = (("accuracy", "Accuracy"), ("f1", "F1 score"))
    for opt_level in ["O1", "O2"]:
        baseline_not_trained, baseline_trained = train_baseline(opt_level)
        accelerator_not_trained, accelerator_trained = train_integration(opt_level)

        result_pairs = (
            (baseline_not_trained, accelerator_not_trained),
            (baseline_trained, accelerator_trained),
        )
        for reference, candidate in result_pairs:
            for key, label in metric_labels:
                assert reference[key] == candidate[key], (
                    f"{label} should be the same for the baseline and accelerator: {reference[key]} == {candidate[key]}"
                )


================================================
FILE: benchmarks/fp8/torchao/Dockerfile
================================================
# Base image: NVIDIA's PyTorch container
FROM nvcr.io/nvidia/pytorch:24.07-py3

# Python packages imported by the benchmark scripts
RUN pip install transformers evaluate datasets
RUN git clone https://github.com/huggingface/accelerate.git

# Install accelerate from the fresh clone in editable mode.
# A trailing `cd` inside a RUN instruction does not carry over to later
# instructions or to the running container, so it is not included here.
RUN cd accelerate && \
    pip install -e .

# Start a shell when the container runs. `RUN /bin/bash` only launched a shell
# during the image build, where it exits immediately and has no effect.
CMD ["/bin/bash"]


================================================
FILE: benchmarks/fp8/torchao/README.md
================================================
# FP8 Benchmarks

Comparing and running [torchao](https://github.com/pytorch/ao/tree/main/torchao/float8) FP8 with accelerate

## Overview

This repo provides scripts which compare native `torchao` model training against `accelerate`'s own integration. Each modeling type is segmented out via a script, supporting the following:

* Single GPU training (`non_distributed.py`)
* Multi-GPU training via DistributedDataParallelism (`ddp.py`)
* Fully Sharded Data Parallelism (`fsdp.py`)
* DeepSpeed ZeRO 1-3 (`distrib_deepspeed.py`)

To run them, it's recommended to use a docker image (see the attached `Dockerfile`) and not install `torchao` manually.

## Running:

There are official Docker images located at `huggingface/accelerate:gpu-fp8-torchao-nightly` which can be used.

You can run all scripts using the core `accelerate launch` command without any `accelerate config` being needed.

For single GPU, run it via `python`:

```bash
python non_distributed.py
```

For the rest, run it via `accelerate launch`:

```bash
accelerate launch ddp.py # or distrib_deepspeed.py, fsdp.py
```

================================================
FILE: benchmarks/fp8/torchao/ddp.py
================================================
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `torchao`.

This particular script verifies this for DDP training.
"""

from functools import partial

import evaluate
import torch
from fp8_utils import get_training_utilities
from torch.nn.parallel import DistributedDataParallel as DDP
from torchao.float8 import convert_to_float8_training

from accelerate import Accelerator
from accelerate.state import AcceleratorState
from accelerate.utils import AORecipeKwargs, set_seed


# Model identifier passed to `get_training_utilities` by both training functions
MODEL_NAME = "bert-base-cased"
# GLUE MRPC metric; the script compares its "accuracy" and "f1" results
METRIC = evaluate.load("glue", "mrpc")


def evaluate_model(model, dataloader, metric, accelerator=None):
    """
    Put `model` in eval mode, run it over `dataloader` and return `metric.compute()`.

    Predictions are the argmax of the logits over the last dimension; references are `batch["labels"]`. When an
    `accelerator` with more than one process is given, both are gathered with `gather_for_metrics` before being
    added to the metric. The model is left in eval mode.
    """
    model.eval()
    for batch in dataloader:
        with torch.no_grad():
            forward_out = model(**batch)
        preds, labels = forward_out.logits.argmax(dim=-1), batch["labels"]
        # Only gather across processes when there is more than one of them
        if accelerator is not None and accelerator.num_processes > 1:
            preds, labels = accelerator.gather_for_metrics((preds, labels))
        metric.add_batch(predictions=preds, references=labels)
    return metric.compute()


def filter_linear_layers(module, fqn, first_layer_name=None, last_layer_name=None):
    """
    Module filter for `convert_to_float8_training`: decide whether `module`, registered under the fully
    qualified name `fqn`, passes the filter.

    Returns `False` for a `torch.nn.Linear` whose `in_features` or `out_features` is not a multiple of 16, and
    for any module whose `fqn` equals `first_layer_name` or `last_layer_name`. Returns `True` otherwise.
    """
    is_misaligned_linear = isinstance(module, torch.nn.Linear) and (
        module.in_features % 16 != 0 or module.out_features % 16 != 0
    )
    # For stability reasons the first and last linear layers are skipped;
    # otherwise the model may not train or converge properly
    is_boundary_layer = fqn in (first_layer_name, last_layer_name)
    return not (is_misaligned_linear or is_boundary_layer)


def train_baseline():
    """
    Train `MODEL_NAME` on MRPC with torchao float8 conversion and a hand-built DDP wrapper, without using
    `Accelerator`'s FP8 support.

    Returns:
        A `(base_model_results, trained_model_results)` tuple with the metric results computed before and after
        the training loop.

    Raises:
        AssertionError: If accuracy or F1 after training is not strictly higher than before training.
    """
    # Same seed as `train_integration`, so both runs are seeded identically
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)
    # Record the names of the first and last `torch.nn.Linear` in `named_modules()` order so the filter
    # can exclude them from the float8 conversion
    first_linear = None
    last_linear = None
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            if first_linear is None:
                first_linear = name
            last_linear = name
    func = partial(filter_linear_layers, first_layer_name=first_linear, last_layer_name=last_linear)
    # The accelerator is only used here for device / process information and for gathering during evaluation
    accelerator = Accelerator()
    device = accelerator.device
    model.to(device)

    convert_to_float8_training(model, module_filter_fn=func)

    # Convert the model to DDP
    device_ids, output_device = [accelerator.local_process_index], accelerator.local_process_index
    model = DDP(model, device_ids=device_ids, output_device=output_device)

    # Metrics of the untrained model, used as the reference point for the assertions below
    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    # A single pass over the train dataloader; note that the backward pass also runs inside the autocast block
    for batch in train_dataloader:
        with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
            batch = batch.to(device)
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)

    # Sanity check: training must have improved both metrics
    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    return base_model_results, trained_model_results


def train_integration():
    """
    Train `MODEL_NAME` on MRPC through `Accelerator` with `mixed_precision="fp8"` and an `AORecipeKwargs`
    handler.

    Returns:
        A `(base_model_results, trained_model_results)` tuple with the metric results computed before and after
        the training loop.

    Raises:
        AssertionError: If accuracy or F1 after training is not strictly higher than before training.
    """
    # Presumably clears the shared accelerate state left by `train_baseline` so a new `Accelerator` can be
    # configured differently — confirm against `AcceleratorState._reset_state`
    AcceleratorState()._reset_state(True)
    accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[AORecipeKwargs()])
    # Same seed as `train_baseline`, so both runs are seeded identically
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    # Only the model and optimizer are prepared here: the dataloaders were already prepared inside
    # `get_training_utilities`, and `lr_scheduler` is used unprepared
    model, optimizer = accelerator.prepare(model, optimizer)
    # Metrics of the untrained model, used as the reference point for the assertions below
    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    # A single pass over the train dataloader
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)

    # Sanity check: training must have improved both metrics
    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    return base_model_results, trained_model_results


if __name__ == "__main__":
    # The accelerate run must reproduce the raw torchao + DDP metrics exactly, both before and after training
    baseline_not_trained, baseline_trained = train_baseline()
    accelerator_not_trained, accelerator_trained = train_integration()

    result_pairs = (
        (baseline_not_trained, accelerator_not_trained),
        (baseline_trained, accelerator_trained),
    )
    for reference, candidate in result_pairs:
        for key, label in (("accuracy", "Accuracy"), ("f1", "F1 score")):
            assert reference[key] == candidate[key], (
                f"{label} should be the same for the baseline and accelerator: {reference[key]} == {candidate[key]}"
            )

    torch.distributed.destroy_process_group()


================================================
FILE: benchmarks/fp8/torchao/distrib_deepspeed.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `torchao`.

This particular script verifies this for deepspeed training.
"""

from functools import partial
from unittest.mock import patch

import deepspeed
import evaluate
import torch
from fp8_utils import evaluate_model, get_training_utilities
from torchao.float8 import convert_to_float8_training
from transformers.integrations import HfDeepSpeedConfig

from accelerate import Accelerator, DeepSpeedPlugin
from accelerate.state import AcceleratorState
from accelerate.utils import AORecipeKwargs, set_seed


# Model identifier passed to `get_training_utilities` by both training functions
MODEL_NAME = "bert-base-cased"
# GLUE MRPC metric; the script compares its "accuracy" and "f1" results
METRIC = evaluate.load("glue", "mrpc")


def filter_linear_layers(module, fqn, first_layer_name=None, last_layer_name=None):
    """
    Module filter for `convert_to_float8_training`: decide whether `module`, registered under the fully
    qualified name `fqn`, passes the filter.

    Returns `False` for a `torch.nn.Linear` whose `in_features` or `out_features` is not a multiple of 16, and
    for any module whose `fqn` equals `first_layer_name` or `last_layer_name`. Returns `True` otherwise.
    """
    if isinstance(module, torch.nn.Linear) and any(
        size % 16 != 0 for size in (module.in_features, module.out_features)
    ):
        return False
    # For stability reasons the first and last linear layers are skipped;
    # otherwise the model may not train or converge properly
    return fqn != first_layer_name and fqn != last_layer_name


def train_baseline(zero_stage: int = 1):
    """
    Train `MODEL_NAME` on MRPC with torchao float8 conversion and a DeepSpeed engine created directly through
    `deepspeed.initialize`, without using `Accelerator`'s FP8 support.

    Args:
        zero_stage: ZeRO stage written into both DeepSpeed configurations.

    Returns:
        A `(base_model_results, trained_model_results, model_outputs, data)` tuple: the metric results before
        and after training, the per-batch logits moved to CPU, and the per-batch inputs moved to CPU.

    Raises:
        AssertionError: If accuracy or F1 after training is not strictly higher than before training.
    """
    set_seed(42)
    # This forces transformers to think Zero-3 Init should be used
    # NOTE(review): the `with` body only assigns `return_value`, so the patch is already undone before any
    # statement below runs — confirm whether the model loading was meant to happen inside this context
    with patch("transformers.integrations.deepspeed.is_deepspeed_zero3_enabled") as mock:
        mock.return_value = zero_stage == 3

    config = HfDeepSpeedConfig(
        {
            "train_micro_batch_size_per_gpu": 16,
            "gradient_accumulation_steps": 1,
            "zero_optimization": {"stage": zero_stage},
        }
    )
    # The accelerator is used to prepare the dataloaders and to gather during evaluation; the model itself is
    # handed to `deepspeed.initialize` below rather than to `accelerator.prepare`
    plugin = DeepSpeedPlugin(hf_ds_config=config)
    accelerator = Accelerator(deepspeed_plugin=plugin)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )
    # Record the names of the first and last `torch.nn.Linear` in `named_modules()` order so the filter
    # can exclude them from the float8 conversion
    first_linear = None
    last_linear = None
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            if first_linear is None:
                first_linear = name
            last_linear = name
    func = partial(filter_linear_layers, first_layer_name=first_linear, last_layer_name=last_linear)

    convert_to_float8_training(model, module_filter_fn=func)

    import numpy as np

    # Plain-dict configuration for the raw DeepSpeed engine; this rebinds `config`, the `HfDeepSpeedConfig`
    # created above stays referenced through `plugin`
    # NOTE(review): train_batch_size 32 with micro batch 16 and no accumulation looks like it assumes exactly
    # 2 processes — confirm
    config = {
        "train_batch_size": 32,
        "train_micro_batch_size_per_gpu": 16,
        "gradient_accumulation_steps": 1,
        "zero_optimization": {
            "stage": zero_stage,
            "offload_optimizer": {"device": "none", "nvme_path": None},
            "offload_param": {"device": "none", "nvme_path": None},
            "stage3_gather_16bit_weights_on_model_save": False,
        },
        "gradient_clipping": 1.0,
        "steps_per_print": np.inf,
        "bf16": {"enabled": True},
        "fp16": {"enabled": False},
        "zero_allow_untested_optimizer": True,
    }

    # The third return value of `deepspeed.initialize` is discarded
    (
        model,
        optimizer,
        _,
        lr_scheduler,
    ) = deepspeed.initialize(
        model=model,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        config_params=config,
    )

    # Metrics of the untrained model, used as the reference point for the assertions below
    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    # Per-batch logits and inputs, kept on CPU and returned to the caller
    model_outputs = []
    data = []

    for batch in train_dataloader:
        outputs = model(**batch)
        data.append(batch.to("cpu"))
        model_outputs.append(outputs.logits.to("cpu"))
        loss = outputs.loss
        # Backward and optimizer step go through the DeepSpeed engine
        model.backward(loss)
        model.step()
        # The scheduler is stepped once per process for every batch; presumably this mirrors how a scheduler
        # prepared by accelerate advances — confirm
        for _ in range(accelerator.num_processes):
            lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.destroy()
    # Sanity check: training must have improved both metrics
    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    del config
    return base_model_results, trained_model_results, model_outputs, data


def train_integration(zero_stage: int = 1):
    """
    Train `MODEL_NAME` on MRPC through `Accelerator` with `mixed_precision="fp8"`, an `AORecipeKwargs` handler
    and a `DeepSpeedPlugin`.

    Args:
        zero_stage: ZeRO stage written into the DeepSpeed configuration.

    Returns:
        A `(base_model_results, trained_model_results, model_outputs, data)` tuple: the metric results before
        and after training, the per-batch logits moved to CPU, and the per-batch inputs moved to CPU.

    Raises:
        AssertionError: If accuracy or F1 after training is not strictly higher than before training.
    """
    set_seed(42)
    # Presumably clears the shared accelerate state left by `train_baseline` so a new `Accelerator` can be
    # configured differently — confirm against `AcceleratorState._reset_state`
    AcceleratorState()._reset_state(True)
    config = HfDeepSpeedConfig(
        {
            "train_micro_batch_size_per_gpu": 16,
            "gradient_accumulation_steps": 1,
            "zero_optimization": {"stage": zero_stage},
        }
    )
    deepspeed_plugin = DeepSpeedPlugin(
        hf_ds_config=config,
    )
    # This forces transformers to think Zero-3 Init should be used
    # NOTE(review): the `with` body only assigns `return_value`, so the patch is already undone before any
    # statement below runs — confirm whether the model loading was meant to happen inside this context
    with patch("transformers.integrations.deepspeed.is_deepspeed_zero3_enabled") as mock:
        mock.return_value = zero_stage == 3
    accelerator = Accelerator(
        mixed_precision="fp8", kwargs_handlers=[AORecipeKwargs()], deepspeed_plugin=deepspeed_plugin
    )

    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    # NOTE(review): the dataloaders were already prepared inside `get_training_utilities` and are passed to
    # `prepare` a second time here — confirm this is intentional
    model, optimizer, lr_scheduler, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, lr_scheduler, train_dataloader, eval_dataloader
    )
    # Metrics of the untrained model, used as the reference point for the assertions below
    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()
    # Per-batch logits and inputs, kept on CPU and returned to the caller
    model_outputs = []
    data = []
    for batch in train_dataloader:
        outputs = model(**batch)
        data.append(batch.to("cpu"))
        model_outputs.append(outputs.logits.to("cpu"))
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.destroy()
    # Sanity check: training must have improved both metrics
    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    del config
    return base_model_results, trained_model_results, model_outputs, data


if __name__ == "__main__":
    # For every ZeRO stage, the accelerate run must reproduce the raw DeepSpeed metrics exactly,
    # both before and after training
    metric_labels = (("accuracy", "Accuracy"), ("f1", "F1 score"))
    for zero_stage in [1, 2, 3]:
        baseline_not_trained, baseline_trained, baseline_outputs, baseline_data = train_baseline(zero_stage)
        accelerator_not_trained, accelerator_trained, accelerator_outputs, accelerator_data = train_integration(
            zero_stage
        )
        result_pairs = (
            (baseline_not_trained, accelerator_not_trained),
            (baseline_trained, accelerator_trained),
        )
        for reference, candidate in result_pairs:
            for key, label in metric_labels:
                assert reference[key] == candidate[key], (
                    f"ZERO stage {zero_stage}: {label} should be the same for the baseline and accelerator: {reference[key]} == {candidate[key]}"
                )
        AcceleratorState()._reset_state(True)
    torch.distributed.destroy_process_group()


================================================
FILE: benchmarks/fp8/torchao/fp8_utils.py
================================================
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch


def get_dataloaders(model_name: str, batch_size: int = 16):
    """
    Create the GLUE MRPC train and validation dataloaders, tokenized with the tokenizer of `model_name`.

    Args:
        model_name: Name handed to `AutoTokenizer.from_pretrained`.
        batch_size: Batch size of the train dataloader. The eval dataloader always uses a batch size of 16.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple. Both drop the last incomplete batch; only the train
        dataloader shuffles.
    """
    from datasets import load_dataset
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    mrpc = load_dataset("glue", "mrpc")

    def tokenize_pairs(batch):
        # max_length=None => fall back to the model max length (which is the default anyway)
        return tokenizer(batch["sentence1"], batch["sentence2"], truncation=True, max_length=None)

    def pad_examples(batch):
        # Sequence lengths are padded to a multiple of 16, which is specific to FP8
        return tokenizer.pad(
            batch,
            padding="longest",
            pad_to_multiple_of=16,
            return_tensors="pt",
        )

    # Tokenize all the examples in all the splits and drop the raw text columns
    tokenized = mrpc.map(tokenize_pairs, batched=True, remove_columns=["idx", "sentence1", "sentence2"])
    # 'labels' is the column name expected by the models of the transformers library
    tokenized = tokenized.rename_column("label", "labels")

    train_split, eval_split = tokenized["train"], tokenized["validation"]
    train_dataloader = DataLoader(
        train_split, shuffle=True, collate_fn=pad_examples, batch_size=batch_size, drop_last=True
    )
    eval_dataloader = DataLoader(eval_split, shuffle=False, collate_fn=pad_examples, batch_size=16, drop_last=True)

    return train_dataloader, eval_dataloader


def get_training_utilities(model_name: str, batch_size: int = 16, accelerator=None, prepare=True):
    """
    Returns a tuple of:
        - Model
        - Optimizer
        - Train dataloader (prepared when `prepare=True`)
        - Eval dataloader (prepared when `prepare=True`)
        - LR Scheduler
    Suitable for training on the MRPC dataset

    Args:
        model_name: Name handed to `AutoModelForSequenceClassification.from_pretrained` and to `get_dataloaders`.
        batch_size: Train batch size forwarded to `get_dataloaders`.
        accelerator: `Accelerator` used to prepare the dataloaders. When `None` and `prepare` is true, a default
            one is created.
        prepare: Whether to pass the two dataloaders through `accelerator.prepare`. When false, the raw
            dataloaders are returned and no `Accelerator` is created.
    """
    from torch.optim import AdamW
    from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup

    from accelerate import Accelerator

    # A default accelerator is only needed when the dataloaders are going to be prepared
    if prepare and accelerator is None:
        accelerator = Accelerator()
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    train_dataloader, eval_dataloader = get_dataloaders(model_name, batch_size)
    optimizer = AdamW(model.parameters(), lr=0.0001)
    # The schedule length is twice the number of batches in the not-yet-prepared train dataloader
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=len(train_dataloader) * 2,
    )
    # Honour `prepare`: previously the flag was accepted but ignored and the dataloaders were always prepared
    if prepare:
        train_dataloader, eval_dataloader = accelerator.prepare(train_dataloader, eval_dataloader)
    return model, optimizer, train_dataloader, eval_dataloader, lr_scheduler


def get_named_parameters(model):
    """
    Return a `{name: parameter}` dict for `model` after unwrapping it with `extract_model_from_parallel`.

    Intended to match `Accelerator.get_named_parameters`.
    """
    from accelerate.utils import extract_model_from_parallel

    return dict(extract_model_from_parallel(model).named_parameters())


def evaluate_model(model, dataloader, metric, accelerator=None):
    """
    Put `model` in eval mode, run it over `dataloader` and return `metric.compute()`.

    Predictions are the argmax of the logits over the last dimension; references are `batch["labels"]`. When an
    `accelerator` with more than one process is given, both are gathered with `gather_for_metrics` before being
    added to the metric. The model is left in eval mode.
    """
    model.eval()
    for batch in dataloader:
        with torch.no_grad():
            result = model(**batch)
        scored = (result.logits.argmax(dim=-1), batch["labels"])
        # Only gather across processes when there is more than one of them
        if accelerator is not None and accelerator.num_processes > 1:
            scored = accelerator.gather_for_metrics(scored)
        predictions, references = scored
        metric.add_batch(predictions=predictions, references=references)
    return metric.compute()


================================================
FILE: benchmarks/fp8/torchao/fsdp.py
================================================
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `torchao`.

This particular script verifies this for FSDP training.
"""

from functools import partial

import evaluate
import torch
from fp8_utils import get_training_utilities
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from torchao.float8 import convert_to_float8_training
from transformers.models.bert import BertLayer

from accelerate import Accelerator
from accelerate import FullyShardedDataParallelPlugin as FSDPPlugin
from accelerate.state import AcceleratorState
from accelerate.utils import AORecipeKwargs, set_seed


# Model identifier passed to `get_training_utilities`
MODEL_NAME = "bert-base-cased"
# GLUE MRPC metric used by `evaluate_model` callers in this script
METRIC = evaluate.load("glue", "mrpc")

# Auto-wrap policy keyed on `BertLayer`; handed to FSDP as `auto_wrap_policy`
FSDP_WRAP_POLICY = partial(transformer_auto_wrap_policy, transformer_layer_cls={BertLayer})


def filter_linear_layers(module, fqn, first_layer_name=None, last_layer_name=None):
    """
    Module filter handed to `convert_to_float8_training` as `module_filter_fn`.

    Returns `False` for a `torch.nn.Linear` whose `in_features` or `out_features` is not a multiple of 16, and for
    any module whose `fqn` equals `first_layer_name` or `last_layer_name`. Returns `True` otherwise.
    """
    if isinstance(module, torch.nn.Linear):
        dims_aligned = module.in_features % 16 == 0 and module.out_features % 16 == 0
        if not dims_aligned:
            return False
    # The first and last linear layers are left out for stability reasons;
    # including them can keep the model from training or converging properly
    return fqn not in (first_layer_name, last_layer_name)


def evaluate_model(model, dataloader, metric, accelerator=None):
    """
    Switch `model` to eval mode, feed every batch of `dataloader` through it and return `metric.compute()`.

    Predictions are the argmax of the logits over the last dimension; references are `batch["labels"]`. When an
    `accelerator` with more than one process is given, both are passed through `accelerator.gather_for_metrics`
    before being added to the metric. The model is left in eval mode.
    """
    model.eval()
    for batch in dataloader:
        with torch.no_grad():
            logits = model(**batch).logits
        predictions, references = logits.argmax(dim=-1), batch["labels"]
        if accelerator is not None and accelerator.num_processes > 1:
            predictions, references = accelerator.gather_for_metrics((predictions, references))
        metric.add_batch(predictions=predictions, references=references)
    return metric.compute()


def train_baseline():
    """
    Reference run: raw `torchao` float8 conversion with a hand-built FSDP wrapper.

    Evaluates the model before and after one pass over the training dataloader, asserts that accuracy and F1 both
    improved, and returns `(base_model_results, trained_model_results)`.
    """
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)

    # Names of every linear layer, in `named_modules()` order, so the first and last can be excluded from conversion
    linear_names = [name for name, module in model.named_modules() if isinstance(module, torch.nn.Linear)]
    first_linear = linear_names[0] if linear_names else None
    last_linear = linear_names[-1] if linear_names else None
    module_filter = partial(filter_linear_layers, first_layer_name=first_linear, last_layer_name=last_linear)

    accelerator = Accelerator()
    device = accelerator.device
    model.to(device)

    convert_to_float8_training(model, module_filter_fn=module_filter)

    # Shard the converted model with FSDP
    fsdp_mixed_precision = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
    model = FSDP(
        model,
        use_orig_params=True,
        mixed_precision=fsdp_mixed_precision,
        auto_wrap_policy=FSDP_WRAP_POLICY,
    )

    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    for batch in train_dataloader:
        # Only the forward pass runs under bf16 autocast
        with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
            batch = batch.to(device)
            outputs = model(**batch)
        outputs.loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)

    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    return base_model_results, trained_model_results


def train_integration():
    """
    `accelerate` run: fp8 mixed precision with `AORecipeKwargs` and an FSDP plugin.

    Resets the accelerator state, evaluates the model before and after one pass over the training dataloader,
    asserts that accuracy and F1 both improved, and returns `(base_model_results, trained_model_results)`.
    """
    AcceleratorState()._reset_state(True)
    mixed_precision_policy = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
    fsdp_plugin = FSDPPlugin(
        auto_wrap_policy=FSDP_WRAP_POLICY,
        use_orig_params=True,
        mixed_precision_policy=mixed_precision_policy,
    )
    accelerator = Accelerator(mixed_precision="fp8", fsdp_plugin=fsdp_plugin, kwargs_handlers=[AORecipeKwargs()])
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    # Only the model and optimizer are prepared here; the dataloaders come back already prepared
    model, optimizer = accelerator.prepare(model, optimizer)
    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    for batch in train_dataloader:
        loss = model(**batch).loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)

    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    return base_model_results, trained_model_results


if __name__ == "__main__":
    # Run the raw torchao reference first, then the `accelerate` integration.
    baseline_not_trained, baseline_trained = train_baseline()
    accelerator_not_trained, accelerator_trained = train_integration()

    # Both runs must report exactly equal metrics, before training...
    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
    )
    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
    )
    # ...and after training.
    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
    )
    assert baseline_trained["f1"] == accelerator_trained["f1"], (
        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
    )

    # Tear down the distributed process group once both runs have been compared.
    torch.distributed.destroy_process_group()


================================================
FILE: benchmarks/fp8/torchao/non_distributed.py
================================================
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `torchao`.

This particular script verifies this for single GPU training.
"""

from functools import partial

import evaluate
import torch
from fp8_utils import get_training_utilities
from torchao.float8 import convert_to_float8_training

from accelerate import Accelerator
from accelerate.state import AcceleratorState
from accelerate.utils import AORecipeKwargs, set_seed


MODEL_NAME = "bert-base-cased"
METRIC = evaluate.load("glue", "mrpc")


def evaluate_model(model, dataloader, metric, accelerator=None):
    """
    Put `model` in eval mode, run it over `dataloader` and return the result of `metric.compute()`.

    For each batch the predicted class is the argmax of the logits along the last dimension and the reference is
    `batch["labels"]`. If `accelerator` is given and reports more than one process, both tensors go through
    `accelerator.gather_for_metrics` first. The model stays in eval mode afterwards.
    """
    model.eval()
    for batch in dataloader:
        with torch.no_grad():
            logits = model(**batch).logits
        predictions = logits.argmax(dim=-1)
        references = batch["labels"]
        multi_process = accelerator is not None and accelerator.num_processes > 1
        if multi_process:
            predictions, references = accelerator.gather_for_metrics((predictions, references))
        metric.add_batch(predictions=predictions, references=references)
    return metric.compute()


def filter_linear_layers(module, fqn, first_layer_name=None, last_layer_name=None):
    """
    Filter used as `module_filter_fn` for `convert_to_float8_training`.

    A `torch.nn.Linear` is rejected (`False`) when either `in_features` or `out_features` is not divisible by 16.
    Any module named `first_layer_name` or `last_layer_name` is rejected as well. Everything else is accepted.
    """
    is_linear = isinstance(module, torch.nn.Linear)
    if is_linear and (module.in_features % 16 != 0 or module.out_features % 16 != 0):
        return False
    # Skipping the first and last linear layers is done for stability:
    # converting them can lead to the model not training or converging properly
    is_edge_layer = fqn in (first_layer_name, last_layer_name)
    return not is_edge_layer


def train_baseline():
    """
    Reference run: raw `torchao` float8 conversion on a single device.

    Evaluates the model before and after one pass over the training dataloader, asserts that accuracy and F1 both
    improved, and returns `(base_model_results, trained_model_results)`.
    """
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)

    # Find the first and last linear layers (in `named_modules()` order) so they can be excluded from conversion
    first_linear = None
    last_linear = None
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            if first_linear is None:
                first_linear = name
            last_linear = name

    func = partial(filter_linear_layers, first_layer_name=first_linear, last_layer_name=last_linear)
    accelerator = Accelerator()
    device = accelerator.device
    model.to(device)
    convert_to_float8_training(model, module_filter_fn=func)
    base_model_results = evaluate_model(model, eval_dataloader, METRIC)
    model.train()

    for batch in train_dataloader:
        # Autocast wraps only the forward pass and loss computation; PyTorch does not
        # recommend running the backward pass under autocast, so it is called outside.
        with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
            outputs = model(**batch)
            loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC)

    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    return base_model_results, trained_model_results


def train_integration():
    """
    `accelerate` run on a single device: fp8 mixed precision configured through `AORecipeKwargs`.

    Evaluates the model before and after one pass over the training dataloader, asserts that accuracy and F1 both
    improved, and returns `(base_model_results, trained_model_results)`.
    """
    set_seed(42)
    accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[AORecipeKwargs()])
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )
    # Only the model is passed through `prepare` in this script
    model = accelerator.prepare(model)
    base_model_results = evaluate_model(model, eval_dataloader, METRIC)
    model.train()

    for batch in train_dataloader:
        model(**batch).loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC)

    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    return base_model_results, trained_model_results


if __name__ == "__main__":
    # Run the raw torchao reference first.
    baseline_not_trained, baseline_trained = train_baseline()
    # Presumably clears the shared accelerator state (including the partial state) so the
    # integration run starts from a fresh `Accelerator` — confirm against `AcceleratorState`.
    AcceleratorState._reset_state(True)
    accelerator_not_trained, accelerator_trained = train_integration()
    # Both runs must report exactly equal metrics, before and after training.
    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
    )
    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
    )
    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
    )
    assert baseline_trained["f1"] == accelerator_trained["f1"], (
        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
    )


================================================
FILE: benchmarks/fp8/transformer_engine/Dockerfile
================================================
# Tag components of the base image: nvcr.io/nvidia/pytorch:<BASE_YEAR>.<BASE_MONTH>-py3
ARG BASE_YEAR=25
ARG BASE_MONTH=03

FROM nvcr.io/nvidia/pytorch:${BASE_YEAR}.${BASE_MONTH}-py3

# Python packages imported by the benchmark scripts
RUN pip install transformers evaluate datasets
RUN git clone https://github.com/huggingface/accelerate.git

# Editable install of accelerate with the `deepspeed` extra.
# NOTE(review): the trailing `cd benchmarks/fp8` only affects the shell of this RUN step and
# does not persist into later instructions or the container; a WORKDIR may have been intended.
RUN cd accelerate && \
    pip install -e .[deepspeed] && \
    cd benchmarks/fp8

# NOTE(review): `RUN /bin/bash` executes at build time only; if a default interactive shell
# was intended, that would be a `CMD` instruction instead. Confirm the intent.
RUN /bin/bash


================================================
FILE: benchmarks/fp8/transformer_engine/README.md
================================================
# FP8 Benchmarks

Comparing and running [TransformerEngine](https://github.com/NVIDIA/TransformerEngine) FP8 with accelerate

## Overview

This repo provides scripts which compare native TransformerEngine model training against `accelerate`'s own integration. Each modeling type is segmented out via a script, supporting the following:

* Single GPU training (`non_distributed.py`)
* Multi-GPU training via DistributedDataParallelism (`ddp.py`)
* Fully Sharded Data Parallelism (`fsdp.py`)
* DeepSpeed ZeRO 1-3 (`distrib_deepspeed.py`)

To run them, it's recommended to use a docker image (see the attached `Dockerfile`) and not install `TransformerEngine` manually.

## Running:

There are official Docker images located at `huggingface/accelerate:gpu-fp8-transformerengine-nightly` which can be used.

You can run all scripts using the core `accelerate launch` command without any `accelerate config` being needed.

For single GPU, run it via `python`:

```bash
python non_distributed.py
```

For the rest, run it via `accelerate launch`:

```bash
accelerate launch ddp.py # or distrib_deepspeed.py, fsdp.py
```

================================================
FILE: benchmarks/fp8/transformer_engine/ddp.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `TransformerEngine`.

This particular script verifies this for DDP training.
"""

import evaluate
import torch
import transformer_engine.common.recipe as te_recipe
import transformer_engine.pytorch as te
from fp8_utils import evaluate_model, get_named_parameters, get_training_utilities
from torch.nn.parallel import DistributedDataParallel as DDP
from transformer_engine.common.recipe import DelayedScaling

from accelerate import Accelerator
from accelerate.state import AcceleratorState
from accelerate.utils import FP8RecipeKwargs, set_seed
from accelerate.utils.transformer_engine import convert_model


MODEL_NAME = "bert-base-cased"
METRIC = evaluate.load("glue", "mrpc")


def train_baseline():
    """
    Reference run: raw TransformerEngine conversion with a hand-built DDP wrapper.

    Evaluates the model before and after two passes over the training dataloader, asserts that accuracy and F1
    both improved, and returns `(base_model_results, trained_model_results)`.
    """
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)
    accelerator = Accelerator()
    device = accelerator.device
    model.to(device)

    # Snapshot the parameters by name, convert the model to TE in place, then snapshot again
    params_before = get_named_parameters(model)

    with torch.no_grad():
        convert_model(model)

    fp8_recipe = DelayedScaling(fp8_format=te_recipe.Format.HYBRID, amax_history_len=32, amax_compute_algo="max")

    params_after = get_named_parameters(model)

    # Wrap the converted model in DDP on this process's local device
    local_index = accelerator.local_process_index
    model = DDP(model, device_ids=[local_index], output_device=local_index)

    # Point the optimizer at the post-conversion parameters, matched to the old ones by name
    old_to_new = {old_param: params_after[name] for name, old_param in params_before.items()}
    for group in optimizer.param_groups:
        group["params"] = [old_to_new[param] for param in group["params"]]

    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    for _ in range(2):
        for batch in train_dataloader:
            # Forward pass under both TE fp8 autocast and bf16 autocast
            with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
                with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                    batch = batch.to(device)
                    outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)

    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    return base_model_results, trained_model_results


def train_integration():
    """
    `accelerate` run: fp8 mixed precision with the TE backend configured through `FP8RecipeKwargs`.

    Resets the accelerator state, evaluates the model before and after two passes over the training dataloader,
    asserts that accuracy and F1 both improved, and returns `(base_model_results, trained_model_results)`.
    """
    recipe_handler = FP8RecipeKwargs(backend="TE", fp8_format="HYBRID", amax_history_len=32, amax_compute_algo="max")
    AcceleratorState()._reset_state(True)
    accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[recipe_handler])
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    model, optimizer = accelerator.prepare(model, optimizer)
    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    for _ in range(2):
        for batch in train_dataloader:
            loss = model(**batch).loss
            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()
            lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)

    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    return base_model_results, trained_model_results


if __name__ == "__main__":
    # Run the raw TransformerEngine reference first, then the `accelerate` integration.
    baseline_not_trained, baseline_trained = train_baseline()
    accelerator_not_trained, accelerator_trained = train_integration()

    # Both runs must report exactly equal metrics, before training...
    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
    )
    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
    )
    # ...and after training.
    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
    )
    assert baseline_trained["f1"] == accelerator_trained["f1"], (
        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
    )

    # Tear down the distributed process group once both runs have been compared.
    torch.distributed.destroy_process_group()


================================================
FILE: benchmarks/fp8/transformer_engine/distrib_deepspeed.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `TransformerEngine`.

This particular script verifies this for DeepSpeed training.
"""

from unittest.mock import patch

import deepspeed
import evaluate
import torch
import transformer_engine.common.recipe as te_recipe
import transformer_engine.pytorch as te
from fp8_utils import evaluate_model, get_named_parameters, get_training_utilities
from transformer_engine.common.recipe import DelayedScaling

from accelerate import Accelerator, DeepSpeedPlugin
from accelerate.state import AcceleratorState
from accelerate.utils import FP8RecipeKwargs, set_seed
from accelerate.utils.transformer_engine import convert_model


MODEL_NAME = "bert-base-cased"
METRIC = evaluate.load("glue", "mrpc")


def train_baseline(zero_stage: int = 1):
    """
    Reference run: raw TransformerEngine conversion trained through `deepspeed.initialize`.

    Args:
        zero_stage (`int`, defaults to 1):
            ZeRO stage written into the DeepSpeed config.

    Returns:
        A tuple `(base_model_results, trained_model_results, model_outputs, data)`: the metric results before and
        after training, the per-step logits moved to CPU, and the per-step batches moved to CPU.
    """
    # This forces transformers to think Zero-3 Init should be used
    # NOTE(review): the patch is only active inside this `with` block, which contains nothing but
    # the `return_value` assignment; the model is built after the block exits, so the patch looks
    # like it has no effect on model creation. Confirm the intended scope.
    with patch("transformers.integrations.deepspeed.is_deepspeed_zero3_enabled") as mock:
        mock.return_value = zero_stage == 3
    set_seed(42)

    accelerator = Accelerator()
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    # Convert the model to TE
    old_named_params = get_named_parameters(model)

    with torch.no_grad():
        convert_model(model)
    new_named_params = get_named_parameters(model)

    # Point the optimizer at the post-conversion parameters, matched to the old ones by name
    mapping = {p: new_named_params[n] for n, p in old_named_params.items()}
    for param_group in optimizer.param_groups:
        param_group["params"] = [mapping[p] for p in param_group["params"]]

    FP8_RECIPE_KWARGS = {"fp8_format": te_recipe.Format.HYBRID, "amax_history_len": 32, "amax_compute_algo": "max"}
    fp8_recipe = DelayedScaling(**FP8_RECIPE_KWARGS)

    import numpy as np

    # DeepSpeed config built by hand: bf16 enabled, no offloading, the requested ZeRO stage
    config = {
        "train_batch_size": 16,
        "train_micro_batch_size_per_gpu": 16,
        "gradient_accumulation_steps": 1,
        "zero_optimization": {
            "stage": zero_stage,
            "offload_optimizer": {"device": "none", "nvme_path": None},
            "offload_param": {"device": "none", "nvme_path": None},
            "stage3_gather_16bit_weights_on_model_save": False,
        },
        "gradient_clipping": 1.0,
        "steps_per_print": np.inf,
        "bf16": {"enabled": True},
        "fp16": {"enabled": False},
        "zero_allow_untested_optimizer": True,
    }

    # Only the first two return values of `deepspeed.initialize` are used
    (
        model,
        optimizer,
        _,
        _,
    ) = deepspeed.initialize(
        model=model,
        optimizer=optimizer,
        config_params=config,
    )

    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    # Per-step logits and batches, both moved to CPU, returned to the caller
    model_outputs = []
    data = []

    for _ in range(2):
        for batch in train_dataloader:
            with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
                outputs = model(**batch)
                data.append(batch.to("cpu"))
            model_outputs.append(outputs.logits.to("cpu"))
            loss = outputs.loss
            # Backward and optimizer step go through the object returned by `deepspeed.initialize`
            model.backward(loss)
            model.step()
            # The scheduler is stepped once per process for every optimizer step
            # (presumably to mirror how the prepared scheduler advances in the integration run; confirm)
            for _ in range(accelerator.num_processes):
                lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.destroy()
    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    return base_model_results, trained_model_results, model_outputs, data


def train_integration(zero_stage: int = 1):
    """
    `accelerate` run: fp8 mixed precision (TE backend) combined with a `DeepSpeedPlugin`.

    Args:
        zero_stage (`int`, defaults to 1):
            ZeRO stage given to the plugin; `zero3_init_flag` is enabled only for stage 3.

    Returns:
        A tuple `(base_model_results, trained_model_results, model_outputs, data)`: the metric results before and
        after training, the per-step logits moved to CPU, and the per-step batches moved to CPU.
    """
    set_seed(42)
    recipe_handler = FP8RecipeKwargs(backend="TE", fp8_format="HYBRID", amax_history_len=32, amax_compute_algo="max")
    AcceleratorState()._reset_state(True)
    use_zero3_init = zero_stage == 3
    ds_plugin = DeepSpeedPlugin(zero_stage=zero_stage, zero3_init_flag=use_zero3_init)
    accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[recipe_handler], deepspeed_plugin=ds_plugin)
    accelerator.state.deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = 16

    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    # Per-step batches and logits, both moved to CPU, returned to the caller
    data, model_outputs = [], []
    for _ in range(2):
        for batch in train_dataloader:
            outputs = model(**batch)
            data.append(batch.to("cpu"))
            model_outputs.append(outputs.logits.to("cpu"))
            accelerator.backward(outputs.loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.destroy()
    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
    assert trained_model_results["f1"] > base_model_results["f1"], (
        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
    )

    return base_model_results, trained_model_results, model_outputs, data


if __name__ == "__main__":
    # Compare the raw reference and the `accelerate` integration at each ZeRO stage.
    for zero_stage in [1, 2, 3]:
        # The per-step outputs and batches are returned by both runs but are not compared below.
        baseline_not_trained, baseline_trained, baseline_outputs, baseline_data = train_baseline(zero_stage)
        accelerator_not_trained, accelerator_trained, accelerator_outputs, accelerator_data = train_integration(
            zero_stage
        )
        # Both runs must report exactly equal metrics, before and after training.
        assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
            f"ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
        )
        assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
            f"ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
        )
        assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
            f"ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
        )
        assert baseline_trained["f1"] == accelerator_trained["f1"], (
            f"ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
        )

        # NOTE(review): the process group is destroyed inside the loop, i.e. once per ZeRO stage, so
        # the next iteration relies on it being re-created; confirm this per-stage teardown is intended.
        torch.distributed.destroy_process_group()


================================================
FILE: benchmarks/fp8/transformer_engine/fp8_utils.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch


def get_dataloaders(model_name: str, batch_size: int = 16):
    """
    Build the train and validation dataloaders for GLUE MRPC, tokenized with the tokenizer of `model_name`.

    Args:
        model_name (`str`):
            Name passed to `AutoTokenizer.from_pretrained`.
        batch_size (`int`, defaults to 16):
            Batch size of the training dataloader. The validation dataloader always uses a batch size of 16.

    Returns:
        A tuple `(train_dataloader, eval_dataloader)`. Both drop the last incomplete batch; only the training
        dataloader shuffles.
    """
    from datasets import load_dataset
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    raw_datasets = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
        return outputs

    # Tokenize every split, dropping the raw text columns, then rename 'label' to 'labels',
    # which is the name the transformers models expect
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=["idx", "sentence1", "sentence2"],
    ).rename_column("label", "labels")

    def collate_fn(examples):
        # Pad to the longest example, rounded up to a multiple of 16 (specific for FP8)
        return tokenizer.pad(examples, padding="longest", pad_to_multiple_of=16, return_tensors="pt")

    train_split = tokenized_datasets["train"]
    eval_split = tokenized_datasets["validation"]

    train_dataloader = DataLoader(
        train_split, shuffle=True, collate_fn=collate_fn, batch_size=batch_size, drop_last=True
    )
    eval_dataloader = DataLoader(eval_split, shuffle=False, collate_fn=collate_fn, batch_size=16, drop_last=True)

    return train_dataloader, eval_dataloader


def get_training_utilities(model_name: str, batch_size: int = 16, accelerator=None):
    """
    Create everything needed to train on the MRPC dataset.

    Args:
        model_name (`str`):
            Name passed to `AutoModelForSequenceClassification.from_pretrained` and to the tokenizer.
        batch_size (`int`, defaults to 16):
            Batch size of the training dataloader.
        accelerator (`Accelerator`, *optional*):
            Accelerator used to prepare the dataloaders. A new `Accelerator()` is created when omitted.

    Returns:
        A tuple of:
            - Model
            - Optimizer
            - Train dataloader (prepared)
            - Eval dataloader (prepared)
            - LR Scheduler
    """
    from torch.optim import AdamW
    from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup

    from accelerate import Accelerator

    accelerator = Accelerator() if accelerator is None else accelerator
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    train_dataloader, eval_dataloader = get_dataloaders(model_name, batch_size)
    optimizer = AdamW(model.parameters(), lr=0.0001)
    # The schedule length is twice the length of the (not yet prepared) training dataloader
    total_steps = len(train_dataloader) * 2
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=total_steps,
    )
    train_dataloader, eval_dataloader = accelerator.prepare(train_dataloader, eval_dataloader)
    return model, optimizer, train_dataloader, eval_dataloader, lr_scheduler


def get_named_parameters(model):
    """
    Same thing as `Accelerator.get_named_parameters`. Returns a dictionary mapping each parameter name to its
    parameter, taken from the model after `extract_model_from_parallel` has been applied to it.
    """
    from accelerate.utils import extract_model_from_parallel

    unwrapped = extract_model_from_parallel(model)
    return dict(unwrapped.named_parameters())


def evaluate_model(model, dataloader, metric, accelerator=None):
    """
    Switches `model` to eval mode, runs it over `dataloader` and returns `metric.compute()`.

    The model is left in eval mode afterwards; callers that keep training must call `model.train()` themselves.

    Args:
        model: Callable model; invoked as `model(**batch)` and expected to return an object with `.logits`.
        dataloader: Iterable of mapping-like batches that contain a `"labels"` entry.
        metric: Object exposing `add_batch(predictions=..., references=...)` and `compute()`.
        accelerator (*optional*): When given and running on more than one process, predictions and references
            are gathered with `accelerator.gather_for_metrics` before being added to the metric.

    Returns:
        Whatever `metric.compute()` returns.
    """
    model.eval()
    for batch in dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        references = batch["labels"]
        if accelerator is not None and accelerator.num_processes > 1:
            predictions, references = accelerator.gather_for_metrics((predictions, references))
        metric.add_batch(predictions=predictions, references=references)
    return metric.compute()


================================================
FILE: benchmarks/fp8/transformer_engine/fsdp.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `TransformersEngine`.

This particular script verifies this for FSDP training.
"""

from functools import partial

import evaluate
import torch
import transformer_engine.common.recipe as te_recipe
import transformer_engine.pytorch as te
from fp8_utils import evaluate_model, get_named_parameters, get_training_utilities
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from transformer_engine.common.recipe import DelayedScaling
from transformers.models.bert import BertLayer

from accelerate import Accelerator
from accelerate import FullyShardedDataParallelPlugin as FSDPPlugin
from accelerate.state import AcceleratorState
from accelerate.utils import FP8RecipeKwargs, set_seed
from accelerate.utils.transformer_engine import convert_model


# Checkpoint name handed to `get_training_utilities` by both training runs below
MODEL_NAME = "bert-base-cased"
# GLUE/MRPC metric loaded through `evaluate`; the runs below read its "accuracy" and "f1" results
METRIC = evaluate.load("glue", "mrpc")

# Auto-wrap policy shared by the raw FSDP baseline and the `FSDPPlugin`, configured with `BertLayer` as the
# transformer layer class
FSDP_WRAP_POLICY = partial(transformer_auto_wrap_policy, transformer_layer_cls={BertLayer})


def train_baseline():
    """
    Reference run: FP8 training with TransformerEngine and PyTorch FSDP wired up by hand.

    The model is converted in place with `convert_model`, wrapped in `FSDP`, and the optimizer's parameter
    groups are re-pointed from the pre-conversion parameters to the post-conversion ones (matched by name).
    The model is evaluated before and after two passes over the train dataloader.

    Returns:
        Tuple of `(metrics before training, metrics after training)`.

    Raises:
        AssertionError: If accuracy or F1 did not improve with training.
    """
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)
    accelerator = Accelerator()
    device = accelerator.device
    model.to(device)

    # Snapshot the parameters on both sides of the TE conversion so the optimizer can be re-pointed below
    params_before_te = get_named_parameters(model)
    with torch.no_grad():
        convert_model(model)
    params_after_te = get_named_parameters(model)

    fp8_recipe = DelayedScaling(fp8_format=te_recipe.Format.HYBRID, amax_history_len=32, amax_compute_algo="max")

    # Wrap the converted model in FSDP
    mixed_precision = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
    model = FSDP(
        model,
        use_orig_params=True,
        mixed_precision=mixed_precision,
        auto_wrap_policy=FSDP_WRAP_POLICY,
    )

    # old parameter object -> parameter with the same name after conversion
    old_to_new = {old_param: params_after_te[name] for name, old_param in params_before_te.items()}
    for group in optimizer.param_groups:
        group["params"] = [old_to_new[old_param] for old_param in group["params"]]

    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    for _epoch in range(2):
        for batch in train_dataloader:
            with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe), torch.autocast(
                device_type="cuda", dtype=torch.bfloat16
            ):
                batch = batch.to(device)
                outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)

    base_accuracy, trained_accuracy = base_model_results["accuracy"], trained_model_results["accuracy"]
    assert trained_accuracy > base_accuracy, (
        f"Accuracy should be higher for the trained model: {trained_accuracy} > {base_accuracy}"
    )
    base_f1, trained_f1 = base_model_results["f1"], trained_model_results["f1"]
    assert trained_f1 > base_f1, f"F1 score should be higher for the trained model: {trained_f1} > {base_f1}"

    return base_model_results, trained_model_results


def train_integration():
    """
    Same experiment as `train_baseline`, but with FP8 and FSDP configured through `Accelerator`.

    The accelerator state is reset first, then an `Accelerator` is built with `mixed_precision="fp8"`, a TE
    `FP8RecipeKwargs` handler and an `FSDPPlugin`. Only the model and optimizer go through
    `accelerator.prepare` here; the scheduler is used as returned by `get_training_utilities`.

    Returns:
        Tuple of `(metrics before training, metrics after training)`.

    Raises:
        AssertionError: If accuracy or F1 did not improve with training.
    """
    fp8_handler = FP8RecipeKwargs(backend="TE", fp8_format="HYBRID", amax_history_len=32, amax_compute_algo="max")
    AcceleratorState()._reset_state(True)
    fsdp_plugin = FSDPPlugin(
        auto_wrap_policy=FSDP_WRAP_POLICY,
        use_orig_params=True,
        mixed_precision_policy=MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32),
    )
    accelerator = Accelerator(mixed_precision="fp8", fsdp_plugin=fsdp_plugin, kwargs_handlers=[fp8_handler])
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    model, optimizer = accelerator.prepare(model, optimizer)
    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    for _epoch in range(2):
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()
            lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)

    base_accuracy, trained_accuracy = base_model_results["accuracy"], trained_model_results["accuracy"]
    assert trained_accuracy > base_accuracy, (
        f"Accuracy should be higher for the trained model: {trained_accuracy} > {base_accuracy}"
    )
    base_f1, trained_f1 = base_model_results["f1"], trained_model_results["f1"]
    assert trained_f1 > base_f1, f"F1 score should be higher for the trained model: {trained_f1} > {base_f1}"

    return base_model_results, trained_model_results


if __name__ == "__main__":
    baseline_not_trained, baseline_trained = train_baseline()
    accelerator_not_trained, accelerator_trained = train_integration()

    # The baseline and the accelerate run must report identical metrics, both before and after training.
    # Checks run in this order: untrained accuracy, untrained f1, trained accuracy, trained f1.
    for baseline_results, accelerator_results in (
        (baseline_not_trained, accelerator_not_trained),
        (baseline_trained, accelerator_trained),
    ):
        for metric_key, metric_label in (("accuracy", "Accuracy"), ("f1", "F1 score")):
            baseline_value = baseline_results[metric_key]
            accelerator_value = accelerator_results[metric_key]
            assert baseline_value == accelerator_value, (
                f"{metric_label} should be the same for the baseline and accelerator: "
                f"{baseline_value} == {accelerator_value}"
            )

    torch.distributed.destroy_process_group()


================================================
FILE: benchmarks/fp8/transformer_engine/non_distributed.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `TransformersEngine`.

This particular script verifies this for single GPU training.
"""

import evaluate
import torch
import transformer_engine.common.recipe as te_recipe
import transformer_engine.pytorch as te
from fp8_utils import evaluate_model, get_named_parameters, get_training_utilities
from transformer_engine.common.recipe import DelayedScaling

from accelerate import Accelerator
from accelerate.state import AcceleratorState
from accelerate.utils import FP8RecipeKwargs, set_seed
from accelerate.utils.transformer_engine import convert_model


# Checkpoint name handed to `get_training_utilities` by both training runs below
MODEL_NAME = "bert-base-cased"
# GLUE/MRPC metric loaded through `evaluate`; the runs below read its "accuracy" and "f1" results
METRIC = evaluate.load("glue", "mrpc")


def train_baseline():
    """
    Reference run: single-GPU FP8 training with TransformerEngine wired up by hand.

    The model is converted in place with `convert_model` and the optimizer's parameter groups are re-pointed
    from the pre-conversion parameters to the post-conversion ones (matched by name). The model is evaluated
    before and after one pass over the train dataloader.

    Returns:
        Tuple of `(metrics before training, metrics after training)`.

    Raises:
        AssertionError: If accuracy or F1 did not improve with training.
    """
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)

    # Snapshot the parameters on both sides of the TE conversion so the optimizer can be re-pointed
    params_before_te = get_named_parameters(model)
    with torch.no_grad():
        convert_model(model)
    params_after_te = get_named_parameters(model)

    # old parameter object -> parameter with the same name after conversion
    old_to_new = {old_param: params_after_te[name] for name, old_param in params_before_te.items()}
    for group in optimizer.param_groups:
        group["params"] = [old_to_new[old_param] for old_param in group["params"]]

    fp8_recipe = DelayedScaling(fp8_format=te_recipe.Format.HYBRID, amax_history_len=32, amax_compute_algo="max")

    model.to("cuda")
    base_model_results = evaluate_model(model, eval_dataloader, METRIC)
    model.train()

    for batch in train_dataloader:
        with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe), torch.autocast(
            device_type="cuda", dtype=torch.bfloat16
        ):
            batch = batch.to("cuda")
            outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC)

    base_accuracy, trained_accuracy = base_model_results["accuracy"], trained_model_results["accuracy"]
    assert trained_accuracy > base_accuracy, (
        f"Accuracy should be higher for the trained model: {trained_accuracy} > {base_accuracy}"
    )
    base_f1, trained_f1 = base_model_results["f1"], trained_model_results["f1"]
    assert trained_f1 > base_f1, f"F1 score should be higher for the trained model: {trained_f1} > {base_f1}"

    return base_model_results, trained_model_results


def train_integration():
    """
    Same experiment as `train_baseline`, but with FP8 configured through `Accelerator`.

    The accelerator state is reset first, then an `Accelerator` is built with `mixed_precision="fp8"` and a TE
    `FP8RecipeKwargs` handler. The model, optimizer and scheduler all go through `accelerator.prepare`.

    Returns:
        Tuple of `(metrics before training, metrics after training)`.

    Raises:
        AssertionError: If accuracy or F1 did not improve with training.
    """
    fp8_handler = FP8RecipeKwargs(backend="TE", fp8_format="HYBRID", amax_history_len=32, amax_compute_algo="max")
    AcceleratorState()._reset_state(True)
    accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[fp8_handler])
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
    base_model_results = evaluate_model(model, eval_dataloader, METRIC)
    model.train()

    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC)

    base_accuracy, trained_accuracy = base_model_results["accuracy"], trained_model_results["accuracy"]
    assert trained_accuracy > base_accuracy, (
        f"Accuracy should be higher for the trained model: {trained_accuracy} > {base_accuracy}"
    )
    base_f1, trained_f1 = base_model_results["f1"], trained_model_results["f1"]
    assert trained_f1 > base_f1, f"F1 score should be higher for the trained model: {trained_f1} > {base_f1}"

    return base_model_results, trained_model_results


if __name__ == "__main__":
    baseline_not_trained, baseline_trained = train_baseline()
    accelerator_not_trained, accelerator_trained = train_integration()

    # The baseline and the accelerate run must report identical metrics, both before and after training.
    # Checks run in this order: untrained accuracy, untrained f1, trained accuracy, trained f1.
    for baseline_results, accelerator_results in (
        (baseline_not_trained, accelerator_not_trained),
        (baseline_trained, accelerator_trained),
    ):
        for metric_key, metric_label in (("accuracy", "Accuracy"), ("f1", "F1 score")):
            baseline_value = baseline_results[metric_key]
            accelerator_value = accelerator_results[metric_key]
            assert baseline_value == accelerator_value, (
                f"{metric_label} should be the same for the baseline and accelerator: "
                f"{baseline_value} == {accelerator_value}"
            )


================================================
FILE: benchmarks/fsdp2/README.md
================================================
# FSDP2 Benchmarks

This benchmark showcases `FSDP2` in 🤗 `accelerate` and compares it to `torch` baseline.

## Overview

This benchmark consists of two parts:
- `main.py` is the main script that runs the benchmark
- `visualize.py` is the script that visualizes the results (if `--output_dir` was specified for the previous command)

## Motivation

We want to showcase that 🤗 `accelerate`'s integration of `FSDP2` is on par with raw PyTorch, and highlight a "broken" part in PyTorch: creating an optimizer before applying `FSDP2` **doesn't result in a working training loop** (more on this later).
This script showcases **matching memory usage and convergence between `accelerate` and `torch`'s baseline.**
To deal with this breaking change (and maintain backward compatibility with FSDP1 in terms of an API), `accelerate` had to come up with a workaround since `accelerate` assumes that the user will nearly always create a model, optimizer, scheduler, etc. beforehand and bring them themselves. This led to an issue of a stark increase in memory as well as the model not even training if the user creates an optimizer beforehand.
To work around this, we replace the parameters inside the optimizer with the newly created FSDP2 sharded ones. More about this can be found in this [blog post (TBD)](TODO)
> [!WARNING]
> This script is intended to fit on 2x 24GB GPUs, though on so few GPUs it's not possible to see the memory difference (discrepancies in grad allocation result in lower memory usage in the non-fixed case), only the difference in convergence. Below are attached results from 8x H100 GPUs where the difference is visible.
> TLDR: more GPUs = bigger memory difference between fixed and non-fixed cases.

## Results

Here are the results from running the benchmark on 8x H100 GPUs:

<p align="center">
  <img src="imgs/allocated_memory.png" width="80%" alt="Allocated Memory Usage">
</p>
<p align="center">
  <img src="imgs/reserved_memory.png" width="80%" alt="Reserved Memory Usage">
</p>

As you can see, the memory usage of `accelerate` and `torch_post_shard` (the **intended** way) are very similar, while `torch_pre_shard_not_fixed` uses significantly more memory. Our fix in `torch_pre_shard_fixed` brings the memory usage back in line with the **intended** approach.

> [!WARNING]
> Timing discrepancies are due to the benchmarks being run in a single script.


## Running

To run the benchmark, you can either use `accelerate launch` or `torchrun`:
```bash
accelerate launch main.py
```
```bash
# For two GPUs
torchrun --nproc_per_node 2 main.py
```

This supports multiple configurable options; you can learn about them by running:
```bash
python3 main.py --help
```

This script will run 4 different benchmarks:
- `torch_optimizer_after_fsdp`: `torch` baseline where optimizer is created after applying `FSDP2`, this is the **intended** way to do it
- `torch_optimizer_before_fsdp_not_fixed`: `torch` baseline where optimizer is created before applying `FSDP2` without fixing the optimizer parameters
- `torch_optimizer_before_fsdp_fixed`: `torch` baseline where optimizer is created before applying `FSDP2` with our fix to the optimizer
- `accelerate`: `accelerate`'s own integration of `FSDP2` where optimizer is created before applying `FSDP2`, but we apply our fix to the optimizer

Memory results are saved in a folder specified by `--output_dir` argument.
Optionally, you can specify `--save_memory_snapshot` to save the torch memory snapshot, which can then be viewed using [`torch memory viz`](https://pytorch.org/memory_viz)

## Visualizing results

To visualize the results, you can run:

```bash
python3 visualize.py --dir <path_to_output_dir>
```

This will then create two plots, showcasing allocated and reserved memory usage between all the different benchmarks discussed above.


================================================
FILE: benchmarks/fsdp2/main.py
================================================
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
from typing import Callable

import torch

from accelerate import Accelerator
from utils import parse_args, prepare_accelerate, prepare_torch


# Model identifier and learning rate used for every benchmark run
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
LEARNING_RATE = 3e-5

# Settings bundle passed as the `config` argument to each `init_fn` (see `evaluate` / `main`)
CONFIG = {
    "model_name": MODEL_NAME,
    "learning_rate": LEARNING_RATE,
}


def train(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    train_dataloader: torch.utils.data.DataLoader,
    accelerator: Accelerator,
) -> torch.Tensor:
    """
    Runs one pass over `train_dataloader` and records the loss of every step.

    Args:
        model (`torch.nn.Module`): Model to train; called as `model(**batch, use_cache=False)`.
        optimizer (`torch.optim.Optimizer`): Optimizer stepped once per batch.
        train_dataloader (`torch.utils.data.DataLoader`): Source of training batches.
        accelerator (`Accelerator`): Used for the backward pass via `accelerator.backward`.

    Returns:
        `torch.Tensor`: 1-D tensor holding the per-step loss values, in iteration order.
    """
    step_losses = []
    for batch in train_dataloader:
        optimizer.zero_grad()
        # `model_output` and `step_loss` stay bound until the next iteration, as in any plain training loop
        model_output = model(**batch, use_cache=False)
        step_loss = model_output.loss

        step_losses.append(step_loss.item())
        accelerator.backward(step_loss)
        optimizer.step()

    return torch.tensor(step_losses)


def evaluate(args, config: dict, init_fn: Callable, run_name: str) -> torch.Tensor:
    """
    Sets up one benchmark variant with `init_fn`, trains it and prints a short summary.

    Args:
        args: Parsed command line arguments, forwarded to `init_fn`.
        config (`dict`): Benchmark configuration, forwarded to `init_fn`.
        init_fn (`Callable`): Called as `init_fn(args, config)`; must return
            `(model, optimizer, dataloader, accelerator, memory_tracker)`.
        run_name (`str`): Label used in the printed summary.

    Returns:
        `torch.Tensor`: The per-step losses returned by `train`.
    """
    model, optimizer, dataloader, accelerator, memory_tracker = init_fn(args, config)

    losses = train(model, optimizer, dataloader, accelerator)

    # Stop tracking before the peak values are read for the summary
    memory_tracker.stop()
    summary_lines = [
        f"Results for {run_name} (rank 0):",
        f"Loss: {losses[-1].item()}",
        f"Peak Allocated Memory: {float(memory_tracker.peak_allocated_memory):.2f} MB",
        f"Peak Reserved Memory: {float(memory_tracker.peak_reserved_memory):.2f} MB",
        "-" * 34,
    ]
    accelerator.print("\n".join(summary_lines))
    return losses


def main():
    """
    Runs every benchmark variant in turn and checks that the variants expected to match produce the same
    per-step losses, then tears down the process group.
    """
    args = parse_args()

    # label -> init function, in execution order
    init_fns = {
        "Optimizer Before FSDP (w/ fix)": functools.partial(
            prepare_torch, post_shard_optimizer=False, apply_optimizer_fix=True
        ),
        "Optimizer Before FSDP (w/o fix)": functools.partial(
            prepare_torch, post_shard_optimizer=False, apply_optimizer_fix=False
        ),
        "Optimizer After FSDP": functools.partial(prepare_torch, post_shard_optimizer=True),
        "Accelerate": prepare_accelerate,
    }

    torch.use_deterministic_algorithms(True)

    results = {}
    for label, init_fn in init_fns.items():
        results[label] = evaluate(args, CONFIG, init_fn=init_fn, run_name=label)

    # Variants whose loss curves have to agree; the "w/o fix" run is deliberately left out
    matching_pairs = (
        ("Optimizer After FSDP", "Optimizer Before FSDP (w/ fix)"),
        ("Optimizer After FSDP", "Accelerate"),
        ("Accelerate", "Optimizer Before FSDP (w/ fix)"),
    )
    for first, second in matching_pairs:
        torch.testing.assert_close(
            results[first],
            results[second],
            msg=f"{first} and {second} should be the same",
        )

    torch.distributed.destroy_process_group()


# Run the benchmark only when this file is executed as a script
if __name__ == "__main__":
    main()


================================================
FILE: benchmarks/fsdp2/measure_utils.py
================================================
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import json
import os
import threading
import time

import psutil
import torch

from accelerate import PartialState


class MemoryTracker:
    def __init__(
        self,
        device: torch.device,
        output_directory: str,
        run_name: str,
        save_memory_snapshot: bool,
        log_interval: float = 0.01,
    ):
        """Class for tracking gpu and cpu memory usage of the process.

        Args:
            device (`torch.device`):
                PyTorch device to monitor.
            output_directory (`str`):
                Directory to save the memory usage data to, will be created if it doesn't exist.
            run_name (`str`):
                Name of the run, will be used to name the output files.
            save_memory_snapshot (`bool`):
                Whether to also save `torch.cuda.memory._dump_snapshot` to the output directory.
            log_interval (`float`, *optional*):
                Interval in seconds between memory measurements. Defaults to 0.01.
        """
        self.log_interval = log_interval
        self.save_memory_snapshot = save_memory_snapshot
        self.output_directory = output_directory
        self.run_name = run_name

        # Parallel sample lists, one entry per measurement taken by `_monitor`
        self.timestamps = []
        self.allocated_memory = []
        self.reserved_memory = []
        self.virtual_memory = []

        self.start_time = None
        self.running = False

        self._thread = None
        self._state = PartialState()
        self._process = psutil.Process()
        self._device = device
        # Backend module matching the device type (e.g. `torch.cuda`), falling back to `torch.cuda`
        self.torch_accelerator_module = getattr(torch, device.type, torch.cuda)

    def _monitor(self):
        """Sampling loop run by the background thread until `self.running` is cleared.

        Every `log_interval` seconds it records allocated/reserved device memory and the process RSS, all
        converted from bytes to MB (bytes / 1024**2), together with the time elapsed since the loop started.
        """
        self.start_time = time.time()

        while self.running:
            allocated = self.torch_accelerator_module.memory_allocated(self._device) / (1024 * 1024)
            reserved = self.torch_accelerator_module.memory_reserved(self._device) / (1024 * 1024)
            virtual_memory = self._process.memory_info().rss / (1024 * 1024)

            self.allocated_memory.append(allocated)
            self.reserved_memory.append(reserved)
            self.virtual_memory.append(virtual_memory)
            self.timestamps.append(time.time() - self.start_time)

            time.sleep(self.log_interval)

    def start(self):
        """Starts tracking: frees cached memory, prepares the output directory and launches the sampler."""
        gc.collect()
        self.torch_accelerator_module.empty_cache()

        if self.output_directory:
            os.makedirs(self.output_directory, exist_ok=True)

        if self.save_memory_snapshot:
            self.torch_accelerator_module.memory._record_memory_history()

        self.running = True
        self._thread = threading.Thread(target=self._monitor)
        # Daemon thread, so a forgotten `stop()` does not keep the process alive
        self._thread.daemon = True
        self._thread.start()

    def stop(self):
        """Stops the sampler and, on the main process, writes the collected data to `output_directory`.

        Writes `<run_name>_memory_usage.json` with the sample lists and, when `save_memory_snapshot` is set,
        `<run_name>_memory_snapshot.pkl`. Nothing is written when `output_directory` is empty.
        """
        self.running = False
        if self._thread:
            self._thread.join()

        if self.save_memory_snapshot and self._state.is_main_process and self.output_directory:
            output_file = os.path.join(self.output_directory, f"{self.run_name}_memory_snapshot.pkl")
            self.torch_accelerator_module.memory._dump_snapshot(output_file)

        if self._state.is_main_process and self.output_directory:
            path = os.path.join(self.output_directory, f"{self.run_name}_memory_usage.json")
            with open(path, "w") as f:
                json.dump(
                    {
                        "timestamps": self.timestamps,
                        "allocated_memory": self.allocated_memory,
                        "reserved_memory": self.reserved_memory,
                        "virtual_memory": self.virtual_memory,
                    },
                    f,
                )
        if self.save_memory_snapshot:
            self.torch_accelerator_module.memory._record_memory_history(False)
        self.torch_accelerator_module.empty_cache()

    @property
    def peak_allocated_memory(self):
        """Highest allocated-memory sample in MB, or `0.0` when no sample has been recorded yet."""
        # `default` avoids a `ValueError` if the tracker was never started or was stopped before the first sample
        return max(self.allocated_memory, default=0.0)

    @property
    def peak_reserved_memory(self):
        """Highest reserved-memory sample in MB, or `0.0` when no sample has been recorded yet."""
        return max(self.reserved_memory, default=0.0)


================================================
FILE: benchmarks/fsdp2/utils.py
================================================
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from types import MethodType
from typing import Union

import torch
from datasets import load_dataset
from measure_utils import MemoryTracker
from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer

from accelerate import Accelerator, FullyShardedDataParallelPlugin
from accelerate.state import AcceleratorState, is_initialized
from accelerate.utils import convert_outputs_to_fp32, set_seed


# Fixed random seed for the benchmark; presumably passed to `set_seed` by the prepare_* helpers — TODO confirm
SEED = 421


def get_named_parameters(model: torch.nn.Module, drop_refs: bool = False) -> dict[str, Union[torch.Tensor, int]]:
    """
    Builds a dictionary keyed by parameter name for every parameter of `model`.

    With `drop_refs=False` the values are the parameters themselves. With `drop_refs=True` only each
    parameter's `data_ptr()` is stored, so the returned dictionary holds no reference to the original
    parameters. It is used to capture the original parameter names before `fully_shard` is applied, keeping
    a unique 1:1 mapping between the original and the sharded parameters.

    Args:
        model (`torch.nn.Module`): Model instance to get the named parameters from
        drop_refs (`bool`, *optional*, defaults to `False`): Whether to store data pointers instead of the parameters

    Returns:
        `dict[str, Union[torch.Tensor, int]]`: Parameter names mapped to data pointers if `drop_refs` is `True`, otherwise to the parameters
    """
    if drop_refs:
        return {name: param.data_ptr() for name, param in model.named_parameters()}
    return dict(model.named_parameters())


def replace_optimizer_params(optimizer: torch.optim.Optimizer):
    """
    Swaps every parameter held by `optimizer` for an uninitialized placeholder of the same shape. Meant to be
    called before `fully_shard` is applied to the model, so that the optimizer no longer references the
    original parameters and `fully_shard` can allocate new memory for the sharded ones.

    Each placeholder gets its `data_ptr` attribute overwritten with the original parameter's data pointer
    (an `int`, shadowing the method of the same name), which is later used to match it with its sharded
    counterpart. The optimizer is modified in-place.

    Args:
        optimizer (torch.optim.Optimizer): Optimizer instance which contains the original model parameters
    """
    for group in optimizer.param_groups:
        group_params = group["params"]
        for index, original_param in enumerate(group_params):
            # A fresh tensor drops the reference to the original param, so `_move_states_to_device` (called by
            # `fully_shard`) reallocates instead of reusing the original memory address for the sharded
            # parameters, which would lead to weird/undefined behavior.
            placeholder = torch.empty_like(original_param)

            # Remember where the original lived, so the parameters can be swapped back later
            placeholder.data_ptr = original_param.data_ptr()

            group_params[index] = placeholder


def swap_back_optimizer_params(
    model: torch.nn.Module, optimizer: torch.optim.Optimizer, old_named_parameter_pointers: dict[str, int]
):
    """
    Counterpart of `replace_optimizer_params`, to be called once `fully_shard` has been applied to the model.
    Every entry in the optimizer's parameter groups is replaced by the model parameter that now carries the
    same name as the original one, using the `data_ptr` values recorded by `replace_optimizer_params` and
    `get_named_parameters` to find it. The optimizer is modified in-place.

    Args:
        model (`torch.nn.Module`): Model instance to get the new named parameters from
        optimizer (`torch.optim.Optimizer`): Optimizer instance to swap the parameters of
        old_named_parameter_pointers (`dict[str, int]`): Dictionary mapping the original parameter names to their original data_ptrs
    """
    # Parameters as they are after `fully_shard`; references are kept because the sharded params are needed
    sharded_by_name = get_named_parameters(model, drop_refs=False)

    # original data_ptr -> sharded parameter registered under the same name
    sharded_by_old_ptr = {
        old_ptr: sharded_by_name[name] for name, old_ptr in old_named_parameter_pointers.items()
    }

    for group in optimizer.param_groups:
        # `data_ptr` is the integer attribute stored on the placeholder here, not a method call
        group["params"] = [sharded_by_old_ptr[placeholder.data_ptr] for placeholder in group["params"]]


def parse_args(argv: Union[list[str], None] = None):
    """
    Parses the benchmark's command line options.

    Args:
        argv (`list[str]`, *optional*): Argument strings to parse. When `None` (the default), `argparse` reads
            them from `sys.argv[1:]`, so existing `parse_args()` calls behave as before.

    Returns:
        `argparse.Namespace`: Namespace with `output_dir`, `save_memory_snapshot`, `batch_size`, `block_size`
        and `dataset_fraction`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--output_dir",
        type=str,
        help="Directory to save the benchmarking results.",
    )
    parser.add_argument(
        "--save_memory_snapshot",
        action="store_true",
        default=False,
        help="If True, `torch.cuda.memory._dump_snapshot` will be used to additionaly save the memory trace.",
    )
    ######################
    # Training arguments #
    ######################
    parser.add_argument(
        "--batch_size",
        type=int,
        default=2,
        help="Batch size for the training loop.",
    )
    parser.add_argument(
        "--block_size",
        type=int,
        default=128,
        help="The maximum sequence length to use with the model.",
    )
    parser.add_argument(
        "--dataset_fraction",
        type=float,
        default=1.0,
        help="Fraction of the dataset to use.",
    )
    return parser.parse_args(argv)


def prepare_dataloader(tokenizer, args, accelerator: Accelerator) -> DataLoader:
    """
    Build the training dataloader over the `tiny_shakespeare` train split and prepare it with `accelerator`.

    The text is tokenized, concatenated and cut into chunks of `block_size` tokens (the remainder that does not fill a
    whole chunk is dropped). `labels` are a copy of `input_ids`.

    Args:
        tokenizer: Tokenizer used to encode the text; its `model_max_length` caps the block size.
        args: Parsed arguments; `block_size`, `dataset_fraction` and `batch_size` are read.
        accelerator (`Accelerator`): Accelerator whose `prepare` method wraps the dataloader.

    Returns:
        `DataLoader`: The dataloader returned by `accelerator.prepare`.
    """
    from itertools import chain

    dataset = load_dataset("tiny_shakespeare", split="train", trust_remote_code=True)

    def tokenize_function(example):
        return tokenizer(
            example["text"],
        )

    dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
    )

    block_size = min(tokenizer.model_max_length, args.block_size)

    def group_texts(examples):
        # Flatten every column in linear time (`sum(lists, [])` copies the accumulated list once per row)
        concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
        total_length = len(next(iter(concatenated_examples.values())))

        # Round down to a multiple of `block_size`, dropping the trailing tokens
        total_length = (total_length // block_size) * block_size

        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }

        result["labels"] = result["input_ids"].copy()
        return result

    dataset = dataset.map(group_texts, batched=True)
    # Keep only the leading `dataset_fraction` of the grouped dataset
    dataset = dataset.select(range(int(len(dataset) * args.dataset_fraction)))

    # Build the collator once and reuse it for every batch instead of re-creating it per call
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        collate_fn=data_collator,
    )
    dataloader = accelerator.prepare(dataloader)
    return dataloader


def get_model(model_name: str):
    """
    Instantiate a causal language model from the configuration of `model_name`.

    The configuration is requested with `torch_dtype=torch.float32`: the model is required to be in fp32, otherwise the
    benchmarks don't match, as accelerate upcasts the parameters to fp32.

    Args:
        model_name (`str`): Identifier passed to `AutoConfig.from_pretrained`.

    Returns:
        The model built by `AutoModelForCausalLM.from_config`.
    """
    return AutoModelForCausalLM.from_config(
        AutoConfig.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float32)
    )


def get_tokenizer(model_name: str):
    """
    Load the tokenizer of `model_name` and reuse its end-of-sequence token as the padding token.

    Args:
        model_name (`str`): Identifier passed to `AutoTokenizer.from_pretrained`.

    Returns:
        The loaded tokenizer, with `pad_token` set to `eos_token`.
    """
    loaded = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Padding reuses the EOS token
    loaded.pad_token = loaded.eos_token
    return loaded


def prepare_torch(
    args, config: dict, post_shard_optimizer: bool = False, apply_optimizer_fix: bool = False
) -> tuple[torch.nn.Module, torch.optim.Optimizer, torch.utils.data.DataLoader, Accelerator, MemoryTracker]:
    """
    Prepare model, optimizer and dataloader for a run where `fully_shard` is applied by hand.

    Args:
        args: Parsed arguments; `output_dir` and `save_memory_snapshot` are read here, and `args` is forwarded to
            `prepare_dataloader`.
        config (`dict`): Must provide the `model_name` and `learning_rate` keys.
        post_shard_optimizer (`bool`, defaults to `False`):
            If `True`, the optimizer is created after `fully_shard` has been applied; otherwise it is created before.
        apply_optimizer_fix (`bool`, defaults to `False`):
            Only used when the optimizer is created before sharding: the optimizer parameters are replaced before
            `fully_shard` and swapped to the sharded parameters afterwards.

    Returns:
        `tuple`: `(model, optimizer, train_dataloader, accelerator, memory_tracker)`. The memory tracker has already
        been started.
    """
    mp_policy = MixedPrecisionPolicy(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.bfloat16,
        output_dtype=torch.bfloat16,
    )

    accelerator = Accelerator(mixed_precision="bf16")
    set_seed(SEED)
    # The run name identifies the variant; the "fixed"/"not_fixed" suffix is only appended when the optimizer
    # is created before sharding
    is_fixed = "fixed" if apply_optimizer_fix else "not_fixed"
    is_post_shard = "optimizer_after_fsdp" if post_shard_optimizer else "optimizer_before_fsdp"
    run_name = f"torch_{is_post_shard}" if post_shard_optimizer else f"torch_{is_post_shard}_{is_fixed}"

    tokenizer = get_tokenizer(config["model_name"])
    train_dataloader = prepare_dataloader(tokenizer, args, accelerator)

    # Tracking starts after the dataloader is ready and before the model is created
    memory_tracker = MemoryTracker(accelerator.device, args.output_dir, run_name, args.save_memory_snapshot)
    memory_tracker.start()

    model = get_model(config["model_name"])
    optimizer = None

    if not post_shard_optimizer:
        optimizer = AdamW(model.parameters(), lr=config["learning_rate"])

        if apply_optimizer_fix:
            # We drop the references to the original parameters, so that `fully_shard` can trigger a new allocation
            # Then we get the `module_name: data_ptr` mapping, so we can swap back the parameters later
            old_named_parameters = get_named_parameters(model, drop_refs=True)

            # We replace the parameters of the optimizer with empty tensors, so that `fully_shard` can trigger a new allocation
            # We also change the `data_ptr` of the parameters to the original ones, so we can swap back the parameters later
            replace_optimizer_params(optimizer)

    # Shard every decoder layer first, then the root module
    for module in model.modules():
        if isinstance(module, Qwen2DecoderLayer):
            fully_shard(module, mp_policy=mp_policy)
    fully_shard(model, mp_policy=mp_policy)

    # We do this to imitate how accelerate forces outputs to be in fp32 via `convert_outputs_to_fp32`
    autocast_context = torch.autocast(device_type=accelerator.state.device.type, dtype=torch.bfloat16)
    model_forward_func = model.forward.__func__
    new_forward = autocast_context(model_forward_func)
    model.forward = MethodType(new_forward, model)
    model.forward = MethodType(convert_outputs_to_fp32(model.forward.__func__), model)

    if post_shard_optimizer:
        optimizer = AdamW(model.parameters(), lr=config["learning_rate"])

    if not post_shard_optimizer and apply_optimizer_fix:
        # We swap the parameters of the optimizer to the new sharded ones
        # (`old_named_parameters` is always bound here: it is assigned under the same condition above)
        swap_back_optimizer_params(model, optimizer, old_named_parameters)

    return model, optimizer, train_dataloader, accelerator, memory_tracker


def prepare_accelerate(
    args, config: dict
) -> tuple[torch.nn.Module, torch.optim.Optimizer, torch.utils.data.DataLoader, Accelerator, MemoryTracker]:
    """
    Prepare model, optimizer and dataloader for a run where sharding is delegated to `Accelerator.prepare`.

    Args:
        args: Parsed arguments; `output_dir` and `save_memory_snapshot` are read here, and `args` is forwarded to
            `prepare_dataloader`.
        config (`dict`): Must provide the `model_name` and `learning_rate` keys.

    Returns:
        `tuple`: `(model, optimizer, train_dataloader, accelerator, memory_tracker)`. The memory tracker has already
        been started.
    """
    # Reset the existing accelerator state before creating a new `Accelerator`
    # NOTE(review): `is_initialized` is presumably `torch.distributed.is_initialized` - confirm against the imports
    if is_initialized():
        AcceleratorState()._reset_state(True)

    fsdp_plugin = FullyShardedDataParallelPlugin(
        fsdp_version=2,
        auto_wrap_policy="transformer_based_wrap",
        transformer_cls_names_to_wrap=["Qwen2DecoderLayer"],
    )
    accelerator = Accelerator(
        fsdp_plugin=fsdp_plugin,
        mixed_precision="bf16",
    )
    set_seed(SEED)

    tokenizer = get_tokenizer(config["model_name"])
    train_dataloader = prepare_dataloader(tokenizer, args, accelerator)

    # Tracking starts after the dataloader is ready and before the model is created
    memory_tracker = MemoryTracker(accelerator.device, args.output_dir, "accelerate", args.save_memory_snapshot)
    memory_tracker.start()

    model = get_model(config["model_name"])
    # The optimizer is created on the unsharded model and handed to `accelerator.prepare` together with it
    optimizer = AdamW(model.parameters(), lr=config["learning_rate"])

    model, optimizer = accelerator.prepare(model, optimizer)

    return model, optimizer, train_dataloader, accelerator, memory_tracker


================================================
FILE: benchmarks/fsdp2/visualize.py
================================================
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json

import matplotlib.pyplot as plt


def parse_args():
    """
    Parse the command-line options of the memory usage visualization script.

    Returns:
        `argparse.Namespace`: Namespace with the `dir`, `memory_threshold` and `filter_partition` attributes.
    """
    threshold_help = "Memory threshold to filter data that is below this value (only filters 1st `--filter_partition` of the points which should roughtly correspond to the model loading)"
    partition_help = "Partition to drop data from that are below the memory threshold"

    cli = argparse.ArgumentParser()
    cli.add_argument("--dir", help="Directory containing the memory usage data", type=str)
    cli.add_argument("--memory_threshold", help=threshold_help, default=0, type=int)
    cli.add_argument("--filter_partition", help=partition_help, default=1 / 3, type=float)
    return cli.parse_args()


def filter_data(data, memory_threshold, filter_partition, key):
    """
    Drop low-memory samples from the leading part of a recorded series.

    Args:
        data (`dict`): Recorded run; `data["timestamps"]` and `data[key]` are read as parallel sequences.
        memory_threshold: Samples strictly below this value are dropped, but only in the leading part.
        filter_partition (`float`): Fraction of the samples (by index) that forms the leading part.
        key (`str`): Name of the memory series to read from `data`.

    Returns:
        `tuple[list, list]`: The kept timestamps and the kept memory values, in their original order.
    """
    times = data["timestamps"]
    values = data[key]

    # Samples at or past this index are always kept
    cutoff = int(len(times) * filter_partition)

    kept = [
        (stamp, value)
        for index, (stamp, value) in enumerate(zip(times, values))
        if index >= cutoff or not value < memory_threshold
    ]
    kept_times = [stamp for stamp, _ in kept]
    kept_values = [value for _, value in kept]
    return kept_times, kept_values


def compare_memory_usage(data, labels, memory_threshold, filter_partition):
    """
    Plot the allocated and the reserved memory of several runs over time, one figure per metric.

    Args:
        data: Recorded runs, each readable by `filter_data` (`"timestamps"`, `"allocated_memory"` and
            `"reserved_memory"` entries).
        labels: Legend labels, parallel to `data`.
        memory_threshold: Forwarded to `filter_data`.
        filter_partition: Forwarded to `filter_data`.

    Returns:
        `tuple`: `(allocated_figure, reserved_figure)`.
    """
    plt.style.use("seaborn-v0_8")
    palette = ["#2ecc71", "#e74c3c", "#3498db", "#f1c40f"]

    # (series key, y-axis label, title) for each figure, in the order the figures are returned
    panels = (
        ("allocated_memory", "Allocated Memory (GB)", "Allocated Memory Usage Over Time"),
        ("reserved_memory", "Reserved Memory (GB)", "Reserved Memory Usage Over Time"),
    )

    figures = []
    for series_key, y_label, title in panels:
        figure, axes = plt.subplots(figsize=(15, 5))
        # `zip` stops at the shortest input, so at most `len(palette)` runs are drawn
        for run, name, shade in zip(data, labels, palette):
            xs, ys = filter_data(run, memory_threshold, filter_partition, series_key)
            axes.plot(xs, ys, label=name, color=shade, linewidth=2)

        axes.set_xlabel("Time (s)", fontsize=12)
        axes.set_ylabel(y_label, fontsize=12)
        axes.set_title(title, fontsize=14, pad=15)
        axes.grid(True, linestyle="--", alpha=0.7)
        axes.legend(frameon=True, fancybox=True, shadow=True, fontsize=10)
        for side in ("top", "right"):
            axes.spines[side].set_visible(False)
        # Called right after the figure is built, while it is still the current figure
        plt.tight_layout()
        figures.append(figure)

    allocated_figure, reserved_figure = figures
    return allocated_figure, reserved_figure


if __name__ == "__main__":
    args = parse_args()
    DIR = args.dir

    def _load_run(run_name):
        # Each run is stored as `<DIR>/<run_name>_memory_usage.json`
        with open(f"{DIR}/{run_name}_memory_usage.json") as f:
            return json.load(f)

    optimizer_before_fsdp_not_fixed = _load_run("torch_optimizer_before_fsdp_not_fixed")
    optimizer_after_fsdp = _load_run("torch_optimizer_after_fsdp")
    optimizer_before_fsdp_fixed = _load_run("torch_optimizer_before_fsdp_fixed")
    accelerate = _load_run("accelerate")

    # Legend label -> recorded run, in plotting order
    runs = {
        "Optimizer Before FSDP (w/o fix)": optimizer_before_fsdp_not_fixed,
        "Optimizer Before FSDP (w/ fix)": optimizer_before_fsdp_fixed,
        "Optimizer After FSDP": optimizer_after_fsdp,
        "Accelerate": accelerate,
    }
    labels = list(runs)
    data = list(runs.values())

    figures = compare_memory_usage(data, labels, args.memory_threshold, args.filter_partition)
    for figure, stem in zip(figures, ("allocated_memory", "reserved_memory")):
        figure.savefig(f"{DIR}/{stem}.png")


================================================
FILE: benchmarks/torch.compile/README.md
================================================
# Regional Compilation Benchmark

This benchmark compares different compilation strategies using PyTorch's `torch.compile` and Accelerate's `compile_regions` utility, which is based on the recipe in [PyTorch documentation](https://pytorch.org/tutorials/recipes/regional_compilation.html).

## Overview

The benchmark evaluates three approaches:

- **Baseline**: No compilation, standard PyTorch eager execution.
- **Full compilation**: Using PyTorch's `torch.compile()` on the entire model.
- **Regional compilation**: Using `accelerate.utils.compile_regions()` which targets specific blocks of the model to optimize compilation time.

Each approach is tested with different batch sizes (1 and 4) and sequence lengths (128) on various LLaMA-based models ranging from 1B to 13B parameters. We purposefully run the forward pass outside of the `torch.no_grad()` context to simulate performance in a training environment, where gradients are needed.

## Usage

To run this benchmark:

```bash
python regional_compilation.py
```

The script will automatically download the model configurations, create models, and benchmark both compilation and inference times across different scenarios.

## Requirements

- Suitable GPU memory for the models being tested.
- PyTorch with CUDA support.
- Transformers library.
- Accelerate library.

## Results

The benchmark results are summarized in the following figures:

- Compilation time is how long it takes to run the first forward pass.
- Speedup factor is the ratio of non-compiled baseline inference time to the fully/regionally compiled inference time.

<p align="center">
  <img src="imgs/compilation_time.png" width="80%" alt="Compilation Time">
</p>
<p align="center">
  <img src="imgs/speedup_factor.png" width="80%" alt="Speedup Factor">
</p>

Full results are available in the tables below:

```markdown
[-------------------------------------------------- NousResearch/Llama-3.2-1B ---------------------------------------------------]
                            |  Inference time (1x128)  |  Inference time (4x128)  |  Compile time (1x128)  |  Compile time (4x128)
1 threads: -----------------------------------------------------------------------------------------------------------------------
      Baseline              |           18.3           |           18.4           |                        |                      
      Full compilation      |            6.3           |           10.0           |        10696.4         |        10248.0       
      Regional compilation  |            9.7           |           10.0           |         1952.7         |         2903.9       

Times are in milliseconds (ms).

[---------------------------------------------- NousResearch/Hermes-3-Llama-3.2-3B ----------------------------------------------]
                            |  Inference time (1x128)  |  Inference time (4x128)  |  Compile time (1x128)  |  Compile time (4x128)
1 threads: -----------------------------------------------------------------------------------------------------------------------
      Baseline              |           33.4           |           33.6           |                        |                      
      Full compilation      |           11.2           |           23.9           |        17857.5         |        17736.5       
      Regional compilation  |           17.3           |           23.7           |         2993.2         |         2478.8       

Times are in milliseconds (ms).

[---------------------------------------------- NousResearch/Hermes-3-Llama-3.1-8B ----------------------------------------------]
                            |  Inference time (1x128)  |  Inference time (4x128)  |  Compile time (1x128)  |  Compile time (4x128)
1 threads: -----------------------------------------------------------------------------------------------------------------------
      Baseline              |           40.3           |           59.5           |                        |                      
      Full compilation      |           18.9           |           54.4           |        20437.8         |        20152.3       
      Regional compilation  |           19.7           |           54.0           |         2903.1         |         2438.0       

Times are in milliseconds (ms).

[--------------------------------------------- NousResearch/Nous-Hermes-Llama2-13b ----------------------------------------------]
                            |  Inference time (1x128)  |  Inference time (4x128)  |  Compile time (1x128)  |  Compile time (4x128)
1 threads: -----------------------------------------------------------------------------------------------------------------------
      Baseline              |           45.5           |          100.4           |                        |                      
      Full compilation      |           29.4           |           89.7           |        23099.4         |        22885.9       
      Regional compilation  |           29.4           |           87.5           |         2945.5         |         2526.2       

Times are in milliseconds (ms).
```

## Results Summary

### Compilation Time

Regional compilation provides significantly faster compilation times compared to full model compilation:

- **Full compilation**: Takes ~10-23 seconds depending on model size.
- **Regional compilation**: Takes only ~2-3 seconds across all model sizes.
- **Speed improvement**: Regional compilation is roughly **3.5-9x faster** to compile (see the tables above).

### Inference Time

Regional compilation delivers inference performance close to full compilation:

- For batch size 1:
  - For smaller models (1B-3B): Full compilation has a slight edge over regional compilation.
  - For larger models (8B-13B): Regional compilation performs similarly to full compilation.
- For batch size 4: Regional compilation performs similarly to full compilation across all models.

## Key Takeaways

1. **Comparable Performance**: Regional compilation delivers performance speedups similar to full compilation, especially for larger models.
2. **Faster Compilation**: Regional compilation significantly reduces the time taken to compile models, making it a more efficient choice for deployment.
3. **Batch Size Impact**: At batch size 4, full compilation and regional compilation perform nearly identically.
4. **Model Size Impact**: Even with a small batch size, full compilation and regional compilation perform similarly for larger models (8B-13B).
5. **Practical Application**: For real-world applications, regional compilation is a practical choice for optimizing training cold start times, especially when working with large models.


================================================
FILE: benchmarks/torch.compile/regional_compilation.py
================================================
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from torch.utils.benchmark import Compare, Timer
from transformers import AutoConfig, AutoModelForCausalLM

from accelerate.test_utils.testing import get_backend
from accelerate.utils import compile_regions


# Benchmark script: for several Llama-based models, time the forward pass of the eager model, the fully compiled
# model (`torch.compile`) and the regionally compiled model (`compile_regions`), then print a comparison table.
torch.set_float32_matmul_precision("high")

# Number of timed runs (`Timer.timeit(number=...)`) for the compile and the inference measurements
COMPILE_ITERS = 2
INFERENCE_ITERS = 100

# `sub_label` / `description` values used for the rows and columns of the comparison table
BASELINE = "Baseline"
COMPILE_TIME = "Compile time"
INFRENCE_TIME = "Inference time"
FULL_COMPILATION = "Full compilation"
REGIONAL_COMPILATION = "Regional compilation"

# Timed statements. The compile statement resets dynamo and clears the inductor caches before the forward pass,
# presumably so that every timed run has to compile again.
INFRENCE_STMT = "model(input_ids, use_cache=False)"
COMPILE_STMT = f"torch._dynamo.reset(); torch._inductor.utils.clear_inductor_caches(); {INFRENCE_STMT}"

# Only the first value returned by `get_backend` (the device type) is used
torch_device_type, _, _ = get_backend()

results = []
for model_id in [
    # non-gated llama models
    "NousResearch/Llama-3.2-1B",
    "NousResearch/Hermes-3-Llama-3.2-3B",
    "NousResearch/Hermes-3-Llama-3.1-8B",
    "NousResearch/Nous-Hermes-Llama2-13b",
]:
    # The model is built from its config under the target device context, cast to fp16 and put in eval mode
    with torch.device(torch_device_type):
        config = AutoConfig.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_config(config).to(dtype=torch.float16).eval()

    full_compilation_model = torch.compile(model)
    regional_compilation_model = compile_regions(model)

    # NOTE: the loop variable below rebinds `model`. The list of tuples is fully built before the first iteration,
    # so its entries still reference the intended models.
    for model, sub_label, description, stmt, iters in [
        (model, BASELINE, INFRENCE_TIME, INFRENCE_STMT, INFERENCE_ITERS),
        (full_compilation_model, FULL_COMPILATION, COMPILE_TIME, COMPILE_STMT, COMPILE_ITERS),
        (full_compilation_model, FULL_COMPILATION, INFRENCE_TIME, INFRENCE_STMT, INFERENCE_ITERS),
        (regional_compilation_model, REGIONAL_COMPILATION, COMPILE_TIME, COMPILE_STMT, COMPILE_ITERS),
        (regional_compilation_model, REGIONAL_COMPILATION, INFRENCE_TIME, INFRENCE_STMT, INFERENCE_ITERS),
    ]:
        for batch_size, sequence_length in [(1, 128), (4, 128)]:
            # Random token ids in [0, 1000) of shape (batch_size, sequence_length)
            input_ids = torch.randint(
                0, 1000, size=(batch_size, sequence_length), dtype=torch.int64, device=torch_device_type
            )
            # `model` and `input_ids` are the only names the timed statement can see
            results.append(
                Timer(
                    label=model_id,
                    sub_label=sub_label,
                    description=f"{description} ({batch_size}x{sequence_length})",
                    globals={"model": model, "input_ids": input_ids},
                    stmt=stmt,
                ).timeit(number=iters)
            )

# Collect every measurement into one comparison and print it
compare = Compare(results)
compare.colorize()
compare.print()


================================================
FILE: docker/README.md
================================================
<!---
Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Official Hugging Face Accelerate Docker Images

Accelerate publishes a variety of docker versions as part of our CI that users can also use. These are stable images that Accelerate can run off of, which come with a variety of different setup configurations, all of which are officially hosted on [Docker Hub](https://hub.docker.com/r/huggingface/accelerate).

A breakdown of each is given below.

## Naming Conventions

Accelerate docker images follow a tagging convention of:

```bash
huggingface/accelerate:{accelerator}-{nightly,release}
```

`accelerator` in this instance is one of the applicable pre-configured backends:
* `gpu`: Comes compiled off of the `nvidia/cuda` image and includes core parts like `bitsandbytes`. Runs off python 3.10.
* `cpu`: Comes compiled off of `python:3.10-slim` and is designed for non-CUDA based workloads.
* More to come soon
* `gpu-deepspeed`: Comes compiled off of the `nvidia/cuda` image and includes core parts like `bitsandbytes` as well as the latest `deepspeed` version. Runs off python 3.10.
* `gpu-fp8-transformerengine`: Comes compiled off of `nvcr.io/nvidia/pytorch` and is specifically for running the `benchmarks/fp8` scripts on devices which support FP8 operations using the `TransformerEngine` library (RTX 4090, H100, etc)

## Nightlies vs Releases

Each release a new build is pushed with a version number included in the name. For a GPU-supported image of version 0.28.0 for instance, it would look like the following:

```bash
huggingface/accelerate:gpu-release-0.28.0
```

Nightlies contain two different image tags. There is a general `nightly` tag which is built each night, and a `nightly-YYYY-MM-DD` which corresponds to a build from a particular date.

For instance, here is an example nightly CPU image from 3/14/2024

```bash
huggingface/accelerate:cpu-nightly-2024-03-14
```

## Running the images

Each image comes compiled with `conda` and an `accelerate` environment that contains all of the installed dependencies.

To pull down the latest nightly run:

```bash
docker pull huggingface/accelerate:gpu-nightly
```

To then run it in interactive mode with GPU-memory available, run:

```bash
docker container run --gpus all -it huggingface/accelerate:gpu-nightly
```

## DEPRECATED IMAGES

CPU and GPU docker images were hosted at `huggingface/accelerate-gpu` and `huggingface/accelerate-cpu`. These builds are now outdated and will not receive updates. 

The builds at the corresponding `huggingface/accelerate:{gpu,cpu}` contain the same `Dockerfile`, so it's as simple as changing the docker image to the desired ones from above. For posterity, we will not be deleting the old images, but they will not be receiving updates going forward.

================================================
FILE: docker/accelerate-cpu/Dockerfile
================================================
# Builds CPU-only Docker image of PyTorch
# Uses multi-staged approach to reduce size
# Stage 1: build a virtual environment holding all Python dependencies
FROM python:3.10-slim AS compile-image

ARG DEBIAN_FRONTEND=noninteractive

# Refresh the package index and install the build tools in the same layer,
# so a cached (stale) index is never combined with a later install step
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    git \
    gcc

# Setup virtual environment for Docker
ENV VIRTUAL_ENV=/opt/venv
RUN python3 -m venv ${VIRTUAL_ENV}
# Make sure we use the virtualenv
ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
WORKDIR /workspace
# Install specific CPU torch wheel to save on space
# (the PyTorch CPU wheel index is added as an extra index for that purpose)
RUN python3 -m pip install --upgrade --no-cache-dir pip
RUN python3 -m pip install --no-cache-dir \
    jupyter \
    git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers] \
    --extra-index-url https://download.pytorch.org/whl/cpu

# Stage 2: copy only the prepared virtual environment into a clean base image
FROM python:3.10-slim AS build-image
COPY --from=compile-image /opt/venv /opt/venv
# Run as a non-root user
RUN useradd -ms /bin/bash user
USER user

# Make sure we use the virtualenv
ENV PATH="/opt/venv/bin:$PATH"
CMD ["/bin/bash"]

================================================
FILE: docker/accelerate-gpu/Dockerfile
================================================
# Builds GPU docker image of PyTorch specifically
# Uses multi-staged approach to reduce size
# Stage 1
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.10
# Install apt libs
RUN apt-get update && \
    apt-get install -y curl git wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists*

# Create our conda env
RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip
# We don't install pytorch here yet since CUDA isn't available
# instead we use the direct torch wheel
ENV PATH="/opt/conda/envs/accelerate/bin:$PATH"
# Activate our bash shell
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
# Activate the conda env, install mpi4py, and install torch + accelerate
RUN source activate accelerate && conda install -c conda-forge mpi4py
RUN source activate accelerate && \
    python3 -m pip install --no-cache-dir \
    git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers] \
    --extra-index-url https://download.pytorch.org/whl/cu126

# Uses the `python3` found first on PATH (the env's bin directory was prepended above)
RUN python3 -m pip install --no-cache-dir bitsandbytes

# Stage 2
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH="/opt/conda/bin:$PATH"

# Install apt libs
RUN apt-get update && \
    apt-get install -y curl git wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists*

# Activate the `accelerate` conda env from the user's profile
# NOTE(review): `~/.profile` is only read by login shells - confirm the env is active under the default CMD
RUN echo "source activate accelerate" >> ~/.profile

# Start a bash shell by default
CMD ["/bin/bash"]

================================================
FILE: docker/accelerate-gpu-deepspeed/Dockerfile
================================================
# Builds GPU docker image of PyTorch specifically (with DeepSpeed)
# Uses multi-staged approach to reduce size
# Stage 1
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.10
# Install apt libs
RUN apt-get update && \
    apt-get install -y curl git wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists*

# Create our conda env
RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip
# We don't install pytorch here yet since CUDA isn't available
# instead we use the direct torch wheel
ENV PATH="/opt/conda/envs/accelerate/bin:$PATH"
# Activate our bash shell
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
# Activate the conda env, install mpi4py, and install torch + accelerate (with the `deepspeed` extra)
RUN source activate accelerate && conda install -c conda-forge mpi4py
RUN source activate accelerate && \
    python3 -m pip install --no-cache-dir \
    git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers,deepspeed] \
    --extra-index-url https://download.pytorch.org/whl/cu126

# Uses the `python3` found first on PATH (the env's bin directory was prepended above)
RUN python3 -m pip install --no-cache-dir bitsandbytes

# Stage 2
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH="/opt/conda/bin:$PATH"

# Install apt libs
RUN apt-get update && \
    apt-get install -y curl git wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists*

# Activate the `accelerate` conda env from the user's profile
# NOTE(review): `~/.profile` is only read by login shells - confirm the env is active under the default CMD
RUN echo "source activate accelerate" >> ~/.profile

# Start a bash shell by default
CMD ["/bin/bash"]

================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SOURCEDIR     = source
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# `help` and `Makefile` are not files to be built, so they are declared phony.
.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
# $@ expands to the requested target name, which is passed to Sphinx as the builder.
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

================================================
FILE: docs/README.md
================================================
<!---
Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Generating the documentation

To generate the documentation, you first have to build it. Several packages are necessary to build the doc, 
you can install them with the following command, at the root of the code repository:

```bash
pip install -e ".[docs]"
```

Then you need to install our special tool that builds the documentation:

```bash
pip install git+https://github.com/huggingface/doc-builder
```

---
**NOTE**

You only need to generate the documentation to inspect it locally (if you're planning changes and want to
check how they look before committing for instance). You don't have to commit the built documentation.

---

## Building the documentation

Once you have set up the `doc-builder` and additional packages, you can generate the documentation by 
typing the following command:

```bash
doc-builder build accelerate docs/source/ --build_dir ~/tmp/test-build
```

You can adapt the `--build_dir` to set any temporary folder that you prefer. This command will create it and generate
the MDX files that will be rendered as the documentation on the main website. You can inspect them in your favorite
Markdown editor.

## Previewing the documentation

To preview the docs, first install the `watchdog` module with:

```bash
pip install watchdog
```

Then run the following command:

```bash
doc-builder preview {package_name} {path_to_docs}
```

For example:

```bash
doc-builder preview accelerate docs/source/
```

The docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment with a link to where the documentation with your changes lives.

---
**NOTE**

The `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` & restart `preview` command (`ctrl-c` to stop it & call `doc-builder preview ...` again).

---

## Adding a new element to the navigation bar

Accepted files are Markdown (.md).

Create a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting
the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/accelerate/blob/main/docs/source/_toctree.yml) file.

## Renaming section headers and moving sections

It helps to keep the old links working when renaming the section header and/or moving sections from one document to another. This is because the old links are likely to be used in Issues, Forums, and Social media, and it makes for a much better user experience if users reading those months later can still easily navigate to the originally intended information.

Therefore, we simply keep a little map of moved sections at the end of the document where the original section was. The key is to preserve the original anchor.

So if you renamed a section from: "Section A" to "Section B", then you can add at the end of the file:

```
Sections that were moved:

[ <a href="#section-b">Section A</a><a id="section-a"></a> ]
```
and of course, if you moved it to another file, then:

```
Sections that were moved:

[ <a href="../new-file#section-b">Section A</a><a id="section-a"></a> ]
```

Use the relative style to link to the new file so that the versioned docs continue to work.


## Writing Documentation - Specification

The `huggingface/accelerate` documentation follows the
[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style for docstrings,
although we can write them directly in Markdown.

### Adding a new tutorial

Adding a new tutorial or section is done in two steps:

- Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
- Link that file in `./source/_toctree.yml` on the correct toc-tree.

Make sure to put your new file under the proper section. It's unlikely to go in the first section (*Get Started*), so
depending on the intended targets (beginners, more advanced users, or researchers) it should go in sections two, three, or
four.

### Writing source documentation

Values that should be put in `code` should either be surrounded by backticks: \`like so\`. Note that argument names
and objects like True, None, or any strings should usually be put in `code`.

When mentioning a class, function, or method, it is recommended to use our syntax for internal links so that our tool
adds a link to its documentation with this syntax: \[\`XXXClass\`\] or \[\`function\`\]. This requires the class or 
function to be in the main package.

If you want to create a link to some internal class or function, you need to
provide its path. For instance: \[\`utils.gather\`\]. This will be converted into a link with
`utils.gather` in the description. To get rid of the path and only keep the name of the object you are
linking to in the description, add a ~: \[\`~utils.gather\`\] will generate a link with `gather` in the description.

The same works for methods so you can either use \[\`XXXClass.method\`\] or \[\`~XXXClass.method\`\].

#### Defining arguments in a method

Arguments should be defined with the `Args:` (or `Arguments:` or `Parameters:`) prefix, followed by a line return and
an indentation. The argument should be followed by its type, with its shape if it is a tensor, a colon, and its
description:

```
    Args:
        n_layers (`int`): The number of layers of the model.
```

If the description is too long to fit in one line (more than 119 characters in total), another indentation is necessary 
before writing the description after the argument.

Finally, to maintain uniformity if any *one* description is too long to fit on one line, the 
rest of the parameters should follow suit and have an indentation before their description.

Here's an example showcasing everything so far:

```
    Args:
        gradient_accumulation_steps (`int`, *optional*, defaults to 1):
            The number of steps that should pass before gradients are accumulated. A number > 1 should be combined with `Accelerator.accumulate`.
        cpu (`bool`, *optional*):
            Whether or not to force the script to execute on CPU. Will ignore GPU available if set to `True` and force the execution on one process only.
```

For optional arguments or arguments with defaults we follow the following syntax: imagine we have a function with the
following signature:

```
def my_function(x: str = None, a: float = 1):
```

then its documentation should look like this:

```
    Args:
        x (`str`, *optional*):
            This argument controls ... and has a description longer than 119 chars.
        a (`float`, *optional*, defaults to 1):
            This argument is used to ... and has a description longer than 119 chars.
```

Note that we always omit the "defaults to \`None\`" when None is the default for any argument. Also note that even
if the first line describing your argument type and its default gets long, you can't break it on several lines. You can
however write as many lines as you want in the indented description (see the example above with `gradient_accumulation_steps`).

#### Writing a multi-line code block

Multi-line code blocks can be useful for displaying examples. They are done between two lines of three backticks as usual in Markdown:


````
```python
# first line of code
# second line
# etc
```
````

#### Writing a return block

The return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation.
The first line should be the type of the return, followed by a line return. No need to indent further for the elements
building the return.

Here's an example of a single value return:

```
    Returns:
        `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
```

Here's an example of a tuple return, comprising several objects:

```
    Returns:
        `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
        - **loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` --
          Total loss is the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
        - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
          Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
```

## Styling the docstring

We have an automatic script running with the `make style` command that will make sure that:
- the docstrings fully take advantage of the line width
- all code examples are formatted using black, like the code of the Transformers library

This script may have some weird failures if you made a syntax mistake or if you uncover a bug. Therefore, it's
recommended to commit your changes before running `make style`, so you can revert the changes done by that script
easily.

## Writing documentation examples

The syntax for Example docstrings can look as follows:

```
    Example:

    ```python
    >>> import time
    >>> from accelerate import Accelerator
    >>> accelerator = Accelerator()
    >>> if accelerator.is_main_process:
    ...     time.sleep(2)
    >>> else:
    ...     print("I'm waiting for the main process to finish its sleep...")
    >>> accelerator.wait_for_everyone()
    >>> # Should print on every process at the same time
    >>> print("Everyone is here")
    ```
```

The docstring should give a minimal, clear example of how the respective function 
is to be used in inference and also include the expected (ideally sensible)
output.
Often, readers will try out the example before even going through the function 
or class definitions. Therefore, it is of utmost importance that the example 
works as expected.

================================================
FILE: docs/source/_toctree.yml
================================================
- sections:
  - local: index
    title: 🤗 Accelerate
  - local: basic_tutorials/install
    title: Installation
  - local: quicktour
    title: Quicktour
  title: Getting started
- sections:
  - local: basic_tutorials/overview
    title: Overview
  - local: basic_tutorials/migration
    title: Add Accelerate to your code
  - local: basic_tutorials/execution
    title: Execution process
  - local: basic_tutorials/tpu
    title: TPU training
  - local: basic_tutorials/launch
    title: Launching Accelerate scripts
  - local: basic_tutorials/notebook
    title: Launching distributed training from Jupyter Notebooks
  title: Tutorials
- sections:
  - isExpanded: true
    sections:
    - local: usage_guides/explore
      title: Start Here!
    - local: usage_guides/model_size_estimator
      title: Model memory estimator
    - local: usage_guides/quantization
      title: Model quantization
    - local: usage_guides/tracking
      title: Experiment trackers
    - local: usage_guides/profiler
      title: Profiler
    - local: usage_guides/checkpoint
      title: Checkpointing
    - local: basic_tutorials/troubleshooting
      title: Troubleshoot
    - local: usage_guides/training_zoo
      title: Example Zoo
    title: Accelerate
  - isExpanded: true
    sections:
    - local: usage_guides/gradient_accumulation
      title: Gradient accumulation
    - local: usage_guides/local_sgd
      title: Local SGD
    - local: usage_guides/low_precision_training
      title: Low precision (FP8) training
    - local: usage_guides/deepspeed
      title: DeepSpeed
    - local: usage_guides/deepspeed_multiple_model
      title: Using multiple models with DeepSpeed
    - local: usage_guides/ddp_comm_hook
      title: DDP Communication Hooks
    - local: usage_guides/fsdp
      title: Fully Sharded Data Parallel
    - local: usage_guides/megatron_lm
      title: Megatron-LM
    - local: usage_guides/sagemaker
      title: Amazon SageMaker
    - local: usage_guides/mps
      title: Apple M1 GPUs
    - local: usage_guides/intel_cpu
      title: Intel CPU
    - local: usage_guides/gaudi
      title: Intel Gaudi
    - local: usage_guides/compilation
      title: Compilation
    title: Training
  - isExpanded: true
    sections:
    - local: usage_guides/big_modeling
      title: Big Model Inference
    - local: usage_guides/distributed_inference
      title: Distributed inference
    title: Inference
  title: How to guides
- sections:
  - local: concept_guides/internal_mechanism
    title: Accelerate's internal mechanism
  - local: concept_guides/big_model_inference
    title: Loading big models into memory
  - local: concept_guides/performance
    title: Comparing performance across distributed setups
  - local: concept_guides/deferring_execution
    title: Executing and deferring jobs
  - local: concept_guides/gradient_synchronization
    title: Gradient synchronization
  - local: concept_guides/fsdp_and_deepspeed
    title: FSDP vs DeepSpeed
  - local: concept_guides/fsdp1_vs_fsdp2
    title: FSDP1 vs FSDP2
  - local: concept_guides/context_parallelism
    title: Context parallelism
  - local: concept_guides/sequence_parallelism
    title: Sequence parallelism
  - local: concept_guides/low_precision_training
    title: Low precision training methods
  - local: concept_guides/training_tpu
    title: Training on TPUs
  title: Concepts and fundamentals
- sections:
  - local: package_reference/accelerator
    title: Accelerator
  - local: package_reference/state
    title: Stateful classes
  - local: package_reference/cli
    title: The Command Line
  - local: package_reference/torch_wrappers
    title: DataLoaders, Optimizers, Schedulers
  - local: package_reference/tracking
    title: Experiment trackers
  - local: package_reference/launchers
    title: Launchers
  - local: package_reference/deepspeed
    title: DeepSpeed utilities
  - local: package_reference/logging
    title: Logging
  - local: package_reference/big_modeling
    title: Working with large models
  - local: package_reference/inference
    title: Pipeline parallelism
  - local: package_reference/kwargs
    title: Kwargs handlers
  - local: package_reference/fp8
    title: FP8
  - local: package_reference/utilities
    title: Utility functions and classes
  - local: package_reference/megatron_lm
    title: Megatron-LM utilities
  - local: package_reference/fsdp
    title: Fully Sharded Data Parallel utilities
  title: "Reference"


================================================
FILE: docs/source/basic_tutorials/execution.md
================================================
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Execution process

When working with distributed training systems, it is important to manage how and when processes are executed across GPUs. Some processes are completed faster than others, and some processes shouldn't begin if others haven't finished yet. Accelerate provides tools for orchestrating when processes are executed to ensure everything remains synchronized across all devices.

This tutorial will teach you how to execute a process on only one machine and how to delay execution until all processes have reached a certain point.

## Execute on one process

Certain code only needs to be run once on a given machine, such as printing a log statement or only displaying one progress bar on the local main process.

<hfoptions id="local-execution">
<hfoption id="statements">

You should use `accelerator.is_local_main_process` to indicate code that should only be executed once.

```py
from tqdm.auto import tqdm

progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
```

You could also wrap a statement with `accelerator.is_local_main_process`.

> [!TIP]
> For standalone `print` statements that aren't wrapped in `accelerator.is_local_main_process`, replace `print` with Accelerate's [`~Accelerator.print`] method to only print once per server.

```py
if accelerator.is_local_main_process:
    print("Accelerate is the best")
```

</hfoption>
<hfoption id="function">

For a function that should only be executed once, use [`~Accelerator.on_local_main_process`].

```py
@accelerator.on_local_main_process
def do_my_thing():
    "Something done once per server"
    do_thing_once_per_server()
```

</hfoption>
</hfoptions>

You could also direct Accelerate to execute code once across *all processes* regardless of the number of machines. This is useful if you're uploading a final model to the Hub.

<hfoptions id="main-execution">
<hfoption id="statement">

You should use `accelerator.is_main_process` to indicate code that should only be executed once across all processes.

```py
if accelerator.is_main_process:
    repo.push_to_hub()
```

</hfoption>
<hfoption id="function">

For a function that should only be executed once across all processes, use [`~Accelerator.on_main_process`].

```py
@accelerator.on_main_process
def do_my_thing():
    "Something done once across all processes"
    do_thing_once()
```

</hfoption>
</hfoptions>

## Execute on a specific process

Accelerate can also help you execute functions that should only be executed on a specific process or a local process index.

<hfoptions id="specific-execution">
<hfoption id="specific process">

Use the [`~Accelerator.on_process`] method and specify the process index to execute a function on.

```py
@accelerator.on_process(process_index=0)
def do_my_thing():
    "Something done on process index 0"
    do_thing_on_index_zero()
```

</hfoption>
<hfoption id="local process">

Use the [`~Accelerator.on_local_process`] method and specify the local process index to execute a function on.

```py
@accelerator.on_local_process(local_process_idx=0)
def do_my_thing():
    "Something done on process index 0 on each server"
    do_thing_on_index_zero_on_each_server()
```

</hfoption>
</hfoptions>

## Defer execution

When you run your script on several GPUs at the same time, some code may be executed faster than others. You might need to wait for all processes to reach a certain point before executing the next set of instructions. For instance, you shouldn’t save a model before making sure every process is done with training.

To do this, add [`~Accelerator.wait_for_everyone`] in your code. This blocks all processes that have finished first from continuing until all remaining processes have reached the same point (this has no effect if you're running on a single GPU or CPU).

```py
accelerator.wait_for_everyone()
```


================================================
FILE: docs/source/basic_tutorials/install.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Installation

Before you start, you will need to set up your environment, install the appropriate packages, and configure Accelerate. Accelerate is tested on **Python 3.8+**.

Accelerate is available on pypi and conda, as well as on GitHub. Details to install from each are below:

## pip

To install Accelerate from pypi, perform:

```bash
pip install accelerate
```

## conda

Accelerate can also be installed with conda with:

```bash
conda install -c conda-forge accelerate
```

## Source

New features are added every day that haven't been released yet. To try them out yourself, install
from the GitHub repository:

```bash
pip install git+https://github.com/huggingface/accelerate
```

If you're working on contributing to the library or wish to play with the source code and see live 
results as you run the code, an editable version can be installed from a locally-cloned version of the 
repository:

```bash
git clone https://github.com/huggingface/accelerate
cd accelerate
pip install -e .
```

## Configuration

After installing, you need to configure Accelerate for how the current system is set up for training. 
To do so run the following and answer the questions prompted to you:

```bash
accelerate config
```

To write a barebones configuration that doesn't include options such as DeepSpeed configuration or running on TPUs, you can quickly run:

```bash
python -c "from accelerate.utils import write_basic_config; write_basic_config(mixed_precision='fp16')"
```

Accelerate will automatically utilize the maximum number of GPUs available and set the mixed precision mode.

To check that your configuration looks fine, run:

```bash
accelerate env
```

An example output is shown below, which describes two GPUs on a single machine with no mixed precision being used:


```bash
- `Accelerate` version: 1.2.0.dev0
- Platform: Linux-6.8.0-47-generic-x86_64-with-glibc2.35
- `accelerate` bash location: /home/zach/miniconda3/envs/accelerate/bin/accelerate
- Python version: 3.10.13
- Numpy version: 1.26.4
- PyTorch version (GPU?): 2.5.1+cu124 (True)
- PyTorch XPU available: False
- PyTorch NPU available: False
- PyTorch MLU available: False
- PyTorch MUSA available: False
- System RAM: 187.91 GB
- GPU type: NVIDIA GeForce RTX 4090
- `Accelerate` default config:
        - compute_environment: LOCAL_MACHINE
        - distributed_type: MULTI_GPU
        - mixed_precision: no
        - use_cpu: False
        - debug: False
        - num_processes: 2
        - machine_rank: 0
        - num_machines: 1
        - gpu_ids: all
        - rdzv_backend: static
        - same_network: True
        - main_training_function: main
        - enable_cpu_affinity: False
        - downcast_bf16: no
        - tpu_use_cluster: False
        - tpu_use_sudo: False
        - tpu_env: []
```


================================================
FILE: docs/source/basic_tutorials/launch.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Launching Accelerate scripts

In the previous tutorial, you were introduced to how to modify your current training script to use Accelerate.
The final version of that code is shown below:

```python
from accelerate import Accelerator

accelerator = Accelerator()

model, optimizer, training_dataloader, scheduler = accelerator.prepare(
    model, optimizer, training_dataloader, scheduler
)

for batch in training_dataloader:
    optimizer.zero_grad()
    inputs, targets = batch
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    accelerator.backward(loss)
    optimizer.step()
    scheduler.step()
```

But how do you run this code and have it utilize the special hardware available to it?

First, you should rewrite the above code into a function, and make it callable as a script. For example:

```diff
  from accelerate import Accelerator
  
+ def main():
      accelerator = Accelerator()

      model, optimizer, training_dataloader, scheduler = accelerator.prepare(
          model, optimizer, training_dataloader, scheduler
      )

      for batch in training_dataloader:
          optimizer.zero_grad()
          inputs, targets = batch
          outputs = model(inputs)
          loss = loss_function(outputs, targets)
          accelerator.backward(loss)
          optimizer.step()
          scheduler.step()

+ if __name__ == "__main__":
+     main()
```

Next, you need to launch it with `accelerate launch`. 

<Tip warning={true}>

  It's recommended you run `accelerate config` before using `accelerate launch` to configure your environment to your liking. 
  Otherwise Accelerate will use very basic defaults depending on your system setup.

</Tip>


## Using accelerate launch

Accelerate has a special CLI command to help you launch your code in your system through `accelerate launch`.
This command wraps around all of the different commands needed to launch your script on various platforms, without you having to remember what each of them is.

<Tip>

  If you are familiar with launching scripts in PyTorch yourself such as with `torchrun`, you can still do this. It is not required to use `accelerate launch`.

</Tip>

You can launch your script quickly by using:

```bash
accelerate launch {script_name.py} --arg1 --arg2 ...
```

Just put `accelerate launch` at the start of your command, and pass in additional arguments and parameters to your script afterward like normal!

Since this runs the various torch spawn methods, all of the expected environment variables can be modified here as well.
For example, here is how to use `accelerate launch` with a single GPU:

```bash
# for cuda device:
CUDA_VISIBLE_DEVICES="0" accelerate launch {script_name.py} --arg1 --arg2 ...
# for xpu device:
ZE_AFFINITY_MASK="0" accelerate launch {script_name.py} --arg1 --arg2 ...
```

You can also use `accelerate launch` without performing `accelerate config` first, but you may need to manually pass in the right configuration parameters.
In this case, Accelerate will make some hyperparameter decisions for you, e.g., if GPUs are available, it will use all of them by default without mixed precision.
Here is how you would use all GPUs and train with mixed precision disabled:

```bash
accelerate launch --multi_gpu {script_name.py} {--arg1} {--arg2} ...
```

Or by specifying a number of GPUs to use:

```bash
accelerate launch --num_processes=2 {script_name.py} {--arg1} {--arg2} ...
```

To get more specific you should pass in the needed parameters yourself. For instance, here is how you 
would also launch that same script on two GPUs using mixed precision while avoiding all of the warnings: 

```bash
accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=2 {script_name.py} {--arg1} {--arg2} ...
```

For a complete list of parameters you can pass in, run:

```bash
accelerate launch -h
```

<Tip>

  Even if you are not using Accelerate in your code, you can still use the launcher for starting your scripts!

</Tip>

For a visualization of this difference, that earlier `accelerate launch` on multi-gpu would look something like so with `torchrun`:

```bash
MIXED_PRECISION="fp16" torchrun --nproc_per_node=2 --nnodes=1 {script_name.py} {--arg1} {--arg2} ...
```

You can also launch your script utilizing the launch CLI as a python module itself, enabling the ability to pass in other python-specific
launching behaviors. To do so, use `accelerate.commands.launch` instead of `accelerate launch`:

```bash
python -m accelerate.commands.launch --num_processes=2 {script_name.py} {--arg1} {--arg2}
```

If you want to execute the script with any other python flags, you can pass them in as well similar to `-m`, such as 
the below example enabling unbuffered stdout and stderr:

```bash
python -u -m accelerate.commands.launch --num_processes=2 {script_name.py} {--arg1} {--arg2}
```

<Tip>

  You can run your code on CPU as well! This is helpful for debugging and testing purposes on toy models and datasets. 

```bash
accelerate launch --cpu {script_name.py} {--arg1} {--arg2}
```  

</Tip>

## Why you should always use `accelerate config`

Why is it useful to the point you should **always** run `accelerate config`? 

Remember that earlier call to `accelerate launch` as well as `torchrun`?
Post configuration, to run that script with the needed parts you just need to use `accelerate launch` outright, without passing anything else in:

```bash
accelerate launch {script_name.py} {--arg1} {--arg2} ...
```


## Custom Configurations

As briefly mentioned earlier, `accelerate launch` should be mostly used through combining set configurations 
made with the `accelerate config` command. These configs are saved to a `default_config.yaml` file in your cache folder for Accelerate. 
This cache folder is located at (with decreasing order of priority):

- The content of your environment variable `HF_HOME` suffixed with `accelerate`.
- If it does not exist, the content of your environment variable `XDG_CACHE_HOME` suffixed with
  `huggingface/accelerate`.
- If this does not exist either, the folder `~/.cache/huggingface/accelerate`.

To have multiple configurations, the flag `--config_file` can be passed to the `accelerate launch` command paired 
with the location of the custom yaml. 

An example yaml may look something like the following for two GPUs on a single machine using `fp16` for mixed precision:
```yaml
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: MULTI_GPU
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 2
use_cpu: false
```

Launching a script from the location of that custom yaml file looks like the following:
```bash
accelerate launch --config_file {path/to/config/my_config_file.yaml} {script_name.py} {--arg1} {--arg2} ...
```

## Multi-node training
Multi-node training with Accelerate is similar to [multi-node training with torchrun](https://pytorch.org/tutorials/intermediate/ddp_series_multinode.html). The simplest way to launch a multi-node training run is to do the following:

- Copy your codebase and data to all nodes. (or place them on a shared filesystem)
- Set up your Python packages on all nodes.
- Run `accelerate config` on the main single node first. After specifying the number of nodes, you will be asked to specify the rank of each node (this will be 0 for the main/master node), along with the IP address and port for the main process. This is required for the worker nodes to communicate with the main process. Afterwards, you can copy or send this config file across all of your nodes, changing the `machine_rank` to 1, 2, 3, etc. to avoid having to run the command (or just follow their directions directly for launching with `torchrun` as well).

Once you have done this, you can start your multi-node training run by running `accelerate launch` (or `torchrun`) on all nodes.

<Tip>
    It is required that the command be run on all nodes for everything to start, not just running it from the main node. You can use something like SLURM or a different process executor to wrap around this requirement and call everything from a single command.
</Tip>

<Tip>

 It is recommended to use the intranet IP of your main node over the public IP for better latency. This is the `192.168.x.x` or the `172.x.x.x` address you see when you run `hostname -I` on the main node.

</Tip>

To get a better idea about multi-node training, check out our example for [multi-node training with FSDP](https://huggingface.co/blog/ram-efficient-pytorch-fsdp).


================================================
FILE: docs/source/basic_tutorials/migration.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Add Accelerate to your code

Each distributed training framework has its own way of doing things which can require writing a lot of custom code to adapt it to your PyTorch training code and training environment. Accelerate offers a friendly way to interface with these distributed training frameworks without having to learn the specific details of each one. Accelerate takes care of those details for you, so you can focus on the training code and scale it to any distributed training environment.

In this tutorial, you'll learn how to adapt your existing PyTorch code with Accelerate and get you on your way toward training on distributed systems with ease! You'll start with a basic PyTorch training loop (it assumes all the training objects like `model` and `optimizer` have been set up already) and progressively integrate Accelerate into it.

```python
device = "cuda"
model.to(device)

for batch in training_dataloader:
    optimizer.zero_grad()
    inputs, targets = batch
    inputs = inputs.to(device)
    targets = targets.to(device)
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    loss.backward()
    optimizer.step()
    scheduler.step()
```

## Accelerator

The [`Accelerator`] is the main class for adapting your code to work with Accelerate. It knows about the distributed setup you're using such as the number of different processes and your hardware type. This class also provides access to many of the necessary methods for enabling your PyTorch code to work in any distributed training environment and for managing and executing processes across devices.

That's why you should always start by importing and creating an [`Accelerator`] instance in your script.

```python
from accelerate import Accelerator

accelerator = Accelerator()
```

The [`Accelerator`] also knows which device to move your PyTorch objects to, so it is recommended to let Accelerate handle this for you.

```diff
- device = "cuda"
+ device = accelerator.device
  model.to(device)
```

## Prepare PyTorch objects

Next, you need to prepare your PyTorch objects (model, optimizer, scheduler, etc.) for distributed training. The [`~Accelerator.prepare`] method takes care of placing your model in the appropriate container (like single GPU or multi-GPU) for your training setup, adapting the optimizer and scheduler to use Accelerate's [`~optimizer.AcceleratedOptimizer`] and [`~scheduler.AcceleratedScheduler`], and creating a new dataloader that can be sharded across processes.

> [!TIP]
> Accelerate only prepares objects that inherit from their respective PyTorch classes such as `torch.optim.Optimizer`.

The PyTorch objects are returned in the same order they're sent.

```py
model, optimizer, training_dataloader, scheduler = accelerator.prepare(
    model, optimizer, training_dataloader, scheduler
)
```

## Training loop

Finally, remove the `to(device)` calls to the inputs and targets in the training loop because Accelerate's DataLoader classes automatically place them on the right device. You should also replace the usual `backward()` pass with Accelerate's [`~Accelerator.backward`] method which scales the gradients for you and uses the appropriate `backward()` method depending on your distributed setup (for example, DeepSpeed or Megatron).

```diff
-   inputs = inputs.to(device)
-   targets = targets.to(device)
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
-   loss.backward()
+   accelerator.backward(loss)
```

Put everything together and your new Accelerate training loop should now look like this!

```python
from accelerate import Accelerator
accelerator = Accelerator()

device = accelerator.device
model, optimizer, training_dataloader, scheduler = accelerator.prepare(
    model, optimizer, training_dataloader, scheduler
)

for batch in training_dataloader:
    optimizer.zero_grad()
    inputs, targets = batch
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    accelerator.backward(loss)
    optimizer.step()
    scheduler.step()
```

## Training features

Accelerate offers additional features - like gradient accumulation, gradient clipping, mixed precision training and more - you can add to your script to improve your training run. Let's explore these three features.

### Gradient accumulation

Gradient accumulation enables you to train on larger batch sizes by accumulating the gradients over multiple batches before updating the weights. This can be useful for getting around memory limitations. To enable this feature in Accelerate, specify the `gradient_accumulation_steps` parameter in the [`Accelerator`] class and add the [`~Accelerator.accumulate`] context manager to your script.

```diff
+ accelerator = Accelerator(gradient_accumulation_steps=2)
  model, optimizer, training_dataloader = accelerator.prepare(model, optimizer, training_dataloader)

  for input, label in training_dataloader:
+     with accelerator.accumulate(model):
          predictions = model(input)
          loss = loss_function(predictions, label)
          accelerator.backward(loss)
          optimizer.step()
          scheduler.step()
          optimizer.zero_grad()
```

### Gradient clipping

Gradient clipping is a technique to prevent "exploding gradients", and Accelerate offers:

* [`~Accelerator.clip_grad_value_`] to clip gradients to a minimum and maximum value
* [`~Accelerator.clip_grad_norm_`] for normalizing gradients to a certain value

### Mixed precision

Mixed precision accelerates training by using a lower precision data type like fp16 (half-precision) to calculate the gradients. For the best performance with Accelerate, the loss should be computed inside your model (like in Transformers models) because computations outside of the model are computed in full precision.

Set the mixed precision type to use in the [`Accelerator`], and then use the [`~Accelerator.autocast`] context manager to automatically cast the values to the specified data type.

> [!WARNING]
> Accelerate enables automatic mixed precision, so [`~Accelerator.autocast`] is only needed if there are other mixed precision operations besides those performed on loss by [`~Accelerator.backward`] which already handles the scaling.

```diff
+ accelerator = Accelerator(mixed_precision="fp16")
+ with accelerator.autocast():
      loss = complex_loss_function(outputs, target)
```

## Save and load

Accelerate can also save and load a *model* once training is complete, or you can save the model and optimizer *state*, which could be useful for resuming training.

### Model

Once all processes are complete, unwrap the model with the [`~Accelerator.unwrap_model`] method before saving it because the [`~Accelerator.prepare`] method wrapped your model into the proper interface for distributed training. If you don't unwrap the model, saving the model state dictionary also saves any potential extra layers from the larger model and you won't be able to load the weights back into your base model.

You should use the [`~Accelerator.save_model`] method to unwrap and save the model state dictionary. This method can also save a model into sharded checkpoints or into the [safetensors](https://hf.co/docs/safetensors/index) format.

<hfoptions id="save">
<hfoption id="single checkpoint">

```py
accelerator.wait_for_everyone()
accelerator.save_model(model, save_directory)
```

<Tip>

For models from the [Transformers](https://hf.co/docs/transformers/index) library, save the model with the [`~transformers.PreTrainedModel.save_pretrained`] method so that it can be reloaded with the [`~transformers.PreTrainedModel.from_pretrained`] method.

```py
from transformers import AutoModel

unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(
    "path/to/my_model_directory",
    is_main_process=accelerator.is_main_process,
    save_function=accelerator.save,
)

model = AutoModel.from_pretrained("path/to/my_model_directory")
```

</Tip>

To load your weights, use the [`~Accelerator.unwrap_model`] method to unwrap the model first before loading the weights. All model parameters are references to tensors, so this loads your weights inside `model`.

```py
unwrapped_model = accelerator.unwrap_model(model)
path_to_checkpoint = os.path.join(save_directory,"pytorch_model.bin")
unwrapped_model.load_state_dict(torch.load(path_to_checkpoint))
```

</hfoption>
<hfoption id="sharded checkpoint">

Set `safe_serialization=True` to save the model in the safetensors format.

```py
accelerator.wait_for_everyone()
accelerator.save_model(model, save_directory, max_shard_size="1GB", safe_serialization=True)
```

To load a sharded checkpoint or a safetensors-formatted checkpoint, use the [`~accelerate.load_checkpoint_in_model`] method. This method allows you to load a checkpoint onto a specific device.

```py
load_checkpoint_in_model(unwrapped_model, save_directory, device_map={"":device})
```

</hfoption>
</hfoptions>

### State

During training, you may want to save the current state of the model, optimizer, random generators, and potentially learning rate schedulers so they can be restored in the *same script*. You should add the [`~Accelerator.save_state`] and [`~Accelerator.load_state`] methods to your script to save and load states.

To further customize where and how states are saved through [`~Accelerator.save_state`], use the [`~utils.ProjectConfiguration`] class. For example, if `automatic_checkpoint_naming` is enabled, each saved checkpoint is stored at `Accelerator.project_dir/checkpoints/checkpoint_{checkpoint_number}`.

Any other stateful items to be stored should be registered with the [`~Accelerator.register_for_checkpointing`] method so they can be saved and loaded. Every object passed to this method to be stored must have a `load_state_dict` and `state_dict` function.

> [!TIP]
> If you have [`torchdata>=0.8.0`](https://github.com/pytorch/data/tree/main) installed, you can additionally pass `use_stateful_dataloader=True` into your [`~utils.DataLoaderConfiguration`]. This extends Accelerate's DataLoader classes with a `load_state_dict` and `state_dict` function, and makes it so `Accelerator.save_state` and `Accelerator.load_state` also track how far into the training dataset it has read when persisting the model.


================================================
FILE: docs/source/basic_tutorials/notebook.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Launching distributed training from Jupyter Notebooks

This tutorial teaches you how to fine-tune a computer vision model with 🤗 Accelerate from a Jupyter Notebook on a distributed system.
You will also learn how to set up a few requirements needed for ensuring your environment is configured properly, your data has been prepared properly, and finally how to launch training.

<Tip>

    This tutorial is also available as a Jupyter Notebook [here](https://github.com/huggingface/notebooks/blob/main/examples/accelerate_examples/simple_cv_example.ipynb)

</Tip>

## Configuring the Environment

Before any training can be performed, an Accelerate config file must exist in the system. Usually this can be done by running the following in a terminal and answering the prompts:

```bash
accelerate config
```

However, if general defaults are fine and you are *not* running on a TPU, Accelerate has a utility to quickly write your device configuration into a config file via [`utils.write_basic_config`].

The following code will restart Jupyter after writing the configuration, as CUDA runtime or XPU runtime was called to perform this. 

<Tip warning={true}>

    CUDA and XPU can't be initialized more than once on a multi-device system. It's fine to debug in the notebook and have calls to CUDA/XPU, but in order to finally train, a full cleanup and restart will need to be performed.
    
</Tip>

```python
import os
from accelerate.utils import write_basic_config

write_basic_config()  # Write a config file
os._exit(00)  # Restart the notebook
```

## Preparing the Dataset and Model

Next you should prepare your dataset. As mentioned earlier, great care should be taken when preparing the `DataLoaders` and model to make sure that **nothing** is put on *any* GPU. 

If you do need to place something on a GPU, it is recommended to put that specific code into a function and call that from within the notebook launcher interface, which will be shown later.

Make sure the dataset is downloaded based on the directions [here](https://github.com/huggingface/accelerate/tree/main/examples#simple-vision-example)

```python
import os, re, torch, PIL
import numpy as np

from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, RandomResizedCrop, Resize, ToTensor

from accelerate import Accelerator
from accelerate.utils import set_seed
from timm import create_model
```

First you need to create a function to extract the class name based on a filename:

```python
import os

data_dir = "../../images"
fnames = os.listdir(data_dir)
fname = fnames[0]
print(fname)
```

```python out
beagle_32.jpg
```

In the case here, the label is `beagle`. Using regex you can extract the label from the filename:

```python
import re


def extract_label(fname):
    stem = fname.split(os.path.sep)[-1]
    return re.search(r"^(.*)_\d+\.jpg$", stem).groups()[0]
```

```python
extract_label(fname)
```

And you can see it properly returned the right name for our file:

```python out
"beagle"
```

Next a `Dataset` class should be made to handle grabbing the image and the label:

```python
class PetsDataset(Dataset):
    def __init__(self, file_names, image_transform=None, label_to_id=None):
        self.file_names = file_names
        self.image_transform = image_transform
        self.label_to_id = label_to_id

    def __len__(self):
        return len(self.file_names)

    def __getitem__(self, idx):
        fname = self.file_names[idx]
        raw_image = PIL.Image.open(fname)
        image = raw_image.convert("RGB")
        if self.image_transform is not None:
            image = self.image_transform(image)
        label = extract_label(fname)
        if self.label_to_id is not None:
            label = self.label_to_id[label]
        return {"image": image, "label": label}
```

Now to build the dataset. Outside the training function you can find and declare all the filenames and labels and use them as references inside the 
launched function:

```python
fnames = [os.path.join("../../images", fname) for fname in fnames if fname.endswith(".jpg")]
```

Next gather all the labels:

```python
all_labels = [extract_label(fname) for fname in fnames]
id_to_label = list(set(all_labels))
id_to_label.sort()
label_to_id = {lbl: i for i, lbl in enumerate(id_to_label)}
```

Next, you should make a `get_dataloaders` function that will return your built dataloaders for you. As mentioned earlier, if data is automatically 
sent to the GPU or a TPU device when building your `DataLoaders`, they must be built using this method. 

```python
def get_dataloaders(batch_size: int = 64):
    "Builds a set of dataloaders with a batch_size"
    random_perm = np.random.permutation(len(fnames))
    cut = int(0.8 * len(fnames))
    train_split = random_perm[:cut]
    eval_split = random_perm[cut:]

    # For training a simple RandomResizedCrop will be used
    train_tfm = Compose([RandomResizedCrop((224, 224), scale=(0.5, 1.0)), ToTensor()])
    train_dataset = PetsDataset([fnames[i] for i in train_split], image_transform=train_tfm, label_to_id=label_to_id)

    # For evaluation a deterministic Resize will be used
    eval_tfm = Compose([Resize((224, 224)), ToTensor()])
    eval_dataset = PetsDataset([fnames[i] for i in eval_split], image_transform=eval_tfm, label_to_id=label_to_id)

    # Instantiate dataloaders
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=4)
    eval_dataloader = DataLoader(eval_dataset, shuffle=False, batch_size=batch_size * 2, num_workers=4)
    return train_dataloader, eval_dataloader
```

Finally, you should import the scheduler to be used later:

```python
from torch.optim.lr_scheduler import CosineAnnealingLR
```

## Writing the Training Function

Now you can build the training loop. [`notebook_launcher`] works by passing in a function to call that will be run across the distributed system.

Here is a basic training loop for the animal classification problem:

<Tip>

    The code has been split up to allow for explanations on each section. A full version that can be copied and pasted is available at the end.

</Tip>


```python
def training_loop(mixed_precision="fp16", seed: int = 42, batch_size: int = 64):
    set_seed(seed)
    accelerator = Accelerator(mixed_precision=mixed_precision)
```

First you should set the seed and create an [`Accelerator`] object as early in the training loop as possible.

<Tip warning={true}>

    If training on the TPU, your training loop should take in the model as a parameter and it should be instantiated 
    outside of the training loop function. See the [TPU best practices](../concept_guides/training_tpu) 
    to learn why

</Tip>

Next you should build your dataloaders and create your model:

```python
    train_dataloader, eval_dataloader = get_dataloaders(batch_size)
    model = create_model("resnet50d", pretrained=True, num_classes=len(label_to_id))
```

<Tip>

    You build the model here so that the seed also controls the new weight initialization

</Tip>

As you are performing transfer learning in this example, the encoder of the model starts out frozen so that only the head of the model is 
trained initially:

```python
    for param in model.parameters():
        param.requires_grad = False
    for param in model.get_classifier().parameters():
        param.requires_grad = True
```

Normalizing the batches of images will make training a little faster:

```python
    mean = torch.tensor(model.default_cfg["mean"])[None, :, None, None]
    std = torch.tensor(model.default_cfg["std"])[None, :, None, None]
```

To make these constants available on the active device, you should move them to the Accelerator's device:

```python
    mean = mean.to(accelerator.device)
    std = std.to(accelerator.device)
```

Next instantiate the rest of the PyTorch classes used for training:

```python
    optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-2 / 25)
    lr_scheduler = OneCycleLR(optimizer=optimizer, max_lr=3e-2, epochs=5, steps_per_epoch=len(train_dataloader))
```

Then pass everything to [`~Accelerator.prepare`].

<Tip>

    There is no specific order to remember, you just need to unpack the objects in the same order you gave them to the prepare method.

</Tip>

```python
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )
```

Now train the model:

```python
    for epoch in range(5):
        model.train()
        for batch in train_dataloader:
            inputs = (batch["image"] - mean) / std
            outputs = model(inputs)
            loss = torch.nn.functional.cross_entropy(outputs, batch["label"])
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
```

The evaluation loop will look slightly different compared to the training loop. The number of elements seen as well as the number of 
accurate predictions in each batch will be accumulated in two counters:

```python
        model.eval()
        accurate = 0
        num_elems = 0
```

Next you have the rest of your standard PyTorch loop:

```python
        for batch in eval_dataloader:
            inputs = (batch["image"] - mean) / std
            with torch.no_grad():
                outputs = model(inputs)
            predictions = outputs.argmax(dim=-1)
```

Finally, there is one last major difference.

When performing distributed evaluation, the predictions and labels need to be passed through 
[`~Accelerator.gather`] so that all of the data is available on the current device and a properly calculated metric can be achieved:

```python
            accurate_preds = accelerator.gather(predictions) == accelerator.gather(batch["label"])
            num_elems += accurate_preds.shape[0]
            accurate += accurate_preds.long().sum()
```

Now you just need to calculate the actual metric for this problem, and you can print it on the main process using [`~Accelerator.print`]:

```python
        eval_metric = accurate.item() / num_elems
        accelerator.print(f"epoch {epoch}: {100 * eval_metric:.2f}")
```

A full version of this training loop is available below:

```python
def training_loop(mixed_precision="fp16", seed: int = 42, batch_size: int = 64):
    set_seed(seed)
    # Initialize accelerator
    accelerator = Accelerator(mixed_precision=mixed_precision)
    # Build dataloaders
    train_dataloader, eval_dataloader = get_dataloaders(batch_size)

    # Instantiate the model (you build the model here so that the seed also controls new weight initializations)
    model = create_model("resnet50d", pretrained=True, num_classes=len(label_to_id))

    # Freeze the base model
    for param in model.parameters():
        param.requires_grad = False
    for param in model.get_classifier().parameters():
        param.requires_grad = True

    # You can normalize the batches of images to be a bit faster
    mean = torch.tensor(model.default_cfg["mean"])[None, :, None, None]
    std = torch.tensor(model.default_cfg["std"])[None, :, None, None]

    # To make these constants available on the active device, set it to the accelerator device
    mean = mean.to(accelerator.device)
    std = std.to(accelerator.device)

    # Instantiate the optimizer
    optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-2 / 25)

    # Instantiate the learning rate scheduler
    lr_scheduler = OneCycleLR(optimizer=optimizer, max_lr=3e-2, epochs=5, steps_per_epoch=len(train_dataloader))

    # Prepare everything
    # There is no specific order to remember, you just need to unpack the objects in the same order you gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # Now you train the model
    for epoch in range(5):
        model.train()
        for batch in train_dataloader:
            inputs = (batch["image"] - mean) / std
            outputs = model(inputs)
            loss = torch.nn.functional.cross_entropy(outputs, batch["label"])
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        model.eval()
        accurate = 0
        num_elems = 0
        for batch in eval_dataloader:
            inputs = (batch["image"] - mean) / std
            with torch.no_grad():
                outputs = model(inputs)
            predictions = outputs.argmax(dim=-1)
            accurate_preds = accelerator.gather(predictions) == accelerator.gather(batch["label"])
            num_elems += accurate_preds.shape[0]
            accurate += accurate_preds.long().sum()

        eval_metric = accurate.item() / num_elems
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}: {100 * eval_metric:.2f}")
```

## Using the notebook_launcher

All that's left is to use the [`notebook_launcher`].

You pass in the function, the arguments (as a tuple), and the number of processes to train on. (See the [documentation](../package_reference/launchers) for more information)

```python
from accelerate import notebook_launcher
```

```python
args = ("fp16", 42, 64)
notebook_launcher(training_loop, args, num_processes=2)
```

In the case of running on multiple nodes, you need to set up a Jupyter session at each node and run the launching cell at the same time.

For an environment containing 2 nodes (computers) with 8 GPUs each and the main computer with an IP address of "172.31.43.8", it would look like so:

```python
notebook_launcher(training_loop, args, master_addr="172.31.43.8", node_rank=0, num_nodes=2, num_processes=8)
```

And in the second Jupyter session on the other machine:

<Tip>

    Notice how the `node_rank` has changed

</Tip>

```python
notebook_launcher(training_loop, args, master_addr="172.31.43.8", node_rank=1, num_nodes=2, num_processes=8)
```

In the case of running on the TPU, it would look like so:

```python
model = create_model("resnet50d", pretrained=True, num_classes=len(label_to_id))

args = (model, "fp16", 42, 64)
notebook_launcher(training_loop, args, num_processes=8)
```

To launch the training process with elasticity, enabling fault tolerance, you can use the `elastic_launch` feature provided by PyTorch. This requires setting additional parameters such as `rdzv_backend` and `max_restarts`. Here is an example of how to use `notebook_launcher` with elastic capabilities:

```python
notebook_launcher(
    training_loop,
    args,
    num_processes=2,
    max_restarts=3
)
```

As it's running it will print the progress as well as state how many devices you ran on. This tutorial was run with two GPUs:

```python out
Launching training on 2 GPUs.
epoch 0: 88.12
epoch 1: 91.73
epoch 2: 92.58
epoch 3: 93.90
epoch 4: 94.71
```

And that's it!

Please note that [`notebook_launcher`] ignores the Accelerate config file. To launch based on the config, use:

```bash
accelerate launch
```

## Debugging 

A common issue when running the `notebook_launcher` is receiving a "CUDA/XPU has already been initialized" error. This usually stems
from an import or prior code in the notebook that makes a call to the PyTorch `torch.cuda` or `torch.xpu` sublibrary. To help narrow down what went wrong,
you can launch the `notebook_launcher` with `ACCELERATE_DEBUG_MODE=yes` in your environment, and an additional check
will be made when spawning that a regular process can be created and can utilize CUDA/XPU without issue. (Your CUDA/XPU code can still be run afterwards.)

## Conclusion

This notebook showed how to perform distributed training from inside of a Jupyter Notebook. Some key notes to remember:

- Make sure to save any code that uses CUDA/XPU (or CUDA/XPU imports) for the function passed to [`notebook_launcher`]
- Set the `num_processes` to be the number of devices used for training (such as number of GPUs, XPUs, CPUs, TPUs, etc)
- If using the TPU, declare your model outside the training loop function


================================================
FILE: docs/source/basic_tutorials/overview.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Overview

Welcome to the Accelerate tutorials! These introductory guides will help get you up to speed on working with Accelerate.
You'll learn how to modify your code to have it work with the API seamlessly, how to launch your script properly,
and more!

These tutorials assume some basic knowledge of Python and familiarity with the PyTorch framework.

If you have any questions about Accelerate, feel free to join and ask the community on our [forum](https://discuss.huggingface.co/c/accelerate/18).

================================================
FILE: docs/source/basic_tutorials/tpu.md
================================================
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# TPU training

A [TPU (Tensor Processing Unit)](https://cloud.google.com/tpu/docs/intro-to-tpu) is a type of hardware specifically designed for training models efficiently. Accelerate supports TPU training, but there are a few things you should be aware of, namely graph compilation. This tutorial briefly discusses compilation, and for more details, take a look at the [Training on TPUs with Accelerate](../concept_guides/training_tpu) guide.

## Compilation

A TPU creates a graph of all the operations in the training step such as the forward pass, backward pass and optimizer step. This is why the first training step always takes a while because building and compiling this graph takes time. But once compilation is complete, it is cached and all subsequent steps are much faster.

The key is to avoid compiling your code again or else training is super slow. This means all your operations must be exactly the same:

* all tensors in your batches must have the same length (for example, no dynamic padding for NLP tasks)
* your code must be static (for example, no layers with for loops that have different lengths depending on the input such as an LSTM)

## Weight tying

A common language model design is to tie the weights of the embedding and softmax layers. However, moving the model to a TPU (either yourself or passing it to the [`~Accelerator.prepare`] method) breaks the weight tying and you'll need to retie the weights.

To add special behavior (like weight tying) in your script for TPUs, first check whether [`~Accelerator.distributed_type`] is `DistributedType.TPU`. Then you can use the [`~transformers.PreTrainedModel.tie_weights`] method to tie the weights.

```py
if accelerator.distributed_type == DistributedType.TPU:
    model.tie_weights()
```


================================================
FILE: docs/source/basic_tutorials/troubleshooting.md
================================================
<!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Troubleshoot

This guide provides solutions to some issues you might encounter when using Accelerate. Not all errors are covered because Accelerate is an active library that is continuously evolving and there are many different use cases and distributed training setups. If the solutions described here don't help with your specific error, please take a look at the [Ask for help](#ask-for-help) section to learn where and how to get help.

## Logging

Logging can help you identify where an error is coming from. In a distributed setup with multiple processes, logging can be a challenge, but Accelerate provides the [`~accelerate.logging`] utility to ensure logs are synchronized.

To troubleshoot an issue, use [`~accelerate.logging`] instead of the standard Python [`logging`](https://docs.python.org/3/library/logging.html#module-logging) module. Set the verbosity level (`INFO`, `DEBUG`, `WARNING`, `ERROR`, `CRITICAL`) with the `log_level` parameter, and then you can either:

1. Export the `log_level` as the `ACCELERATE_LOG_LEVEL` environment variable.
2. Pass the `log_level` directly to `get_logger`.

For example, to set `log_level="DEBUG"`:

```py
from accelerate.logging import get_logger

logger = get_logger(__name__, log_level="DEBUG")
```

By default, the log is called on main processes only. To call it on all processes, pass `main_process_only=False`.
If a log should be called on all processes and in order, also pass `in_order=True`.

```py
from accelerate.logging import get_logger

logger = get_logger(__name__, log_level="DEBUG")
# log all processes
logger.debug("thing_to_log", main_process_only=False)
# log all processes in order
logger.debug("thing_to_log", main_process_only=False, in_order=True)
```

## Hanging code and timeout errors

There can be many reasons why your code is hanging. Let's take a look at how to solve some of the most common issues that can cause your code to hang.

### Mismatched tensor shapes

Mismatched tensor shapes is a common issue that can cause your code to hang for a significant amount of time on a distributed setup.

When running scripts in a distributed setup, functions such as [`Accelerator.gather`] and [`Accelerator.reduce`] are necessary to grab tensors across devices to collectively perform operations on them. These (and other) functions rely on `torch.distributed` to perform a `gather` operation, which requires tensors to have the **exact same shape** across all processes. When the tensor shapes don't match, your code hangs and you'll eventually hit a timeout exception.

You can use Accelerate's operational debug mode to immediately catch this issue. We recommend enabling this mode during the `accelerate config` setup, but you can also enable it from the CLI, as an environment variable, or by manually editing the `config.yaml` file.

<hfoptions id="mismatch">
<hfoption id="CLI">

```bash
accelerate launch --debug {my_script.py} --arg1 --arg2
```

</hfoption>
<hfoption id="environment variable">

If enabling debug mode as an environment variable, you don't need to call `accelerate launch`.

```bash
ACCELERATE_DEBUG_MODE="1" torchrun {my_script.py} --arg1 --arg2
```

</hfoption>
<hfoption id="config.yaml">

Add `debug: true` to your `config.yaml` file.

```yaml
compute_environment: LOCAL_MACHINE
debug: true
```

</hfoption>
</hfoptions>

Once you enable debug mode, you should get a traceback that points to the tensor shape mismatch issue.

```py
Traceback (most recent call last):
  File "/home/zach_mueller_huggingface_co/test.py", line 18, in <module>
    main()
  File "/home/zach_mueller_huggingface_co/test.py", line 15, in main
    broadcast_tensor = broadcast(tensor)
  File "/home/zach_mueller_huggingface_co/accelerate/src/accelerate/utils/operations.py", line 303, in wrapper
accelerate.utils.operations.DistributedOperationException:

Cannot apply desired operation due to shape mismatches. All shapes across devices must be valid.

Operation: `accelerate.utils.operations.broadcast`
Input shapes:
  - Process 0: [1, 5]
  - Process 1: [1, 2, 5]
```

### Early stopping

For early stopping in distributed training, if each process has a specific stopping condition (e.g. validation loss), it may not be synchronized across all processes. As a result, a break can happen on process 0 but not on process 1 which will cause your code to hang indefinitely until a timeout occurs.

If you have early stopping conditionals, use the `set_trigger` and `check_trigger` methods to make sure all the processes
are ended correctly.

```py
# Assume `should_do_breakpoint` is a custom-defined function that returns a conditional, 
# and that conditional might be true only on process 1
if should_do_breakpoint(loss):
    accelerator.set_trigger()

# Later in the training script when we need to check for the breakpoint
if accelerator.check_trigger():
    break
```

### Low kernel versions on Linux

On Linux with kernel version < 5.5, hanging processes have been reported. To avoid this problem, upgrade your system to a later kernel version.

### MPI

If your distributed CPU training job using MPI is hanging, ensure that you have
[passwordless SSH](https://www.open-mpi.org/faq/?category=rsh#ssh-keys) set up (using keys) between the nodes. This means
that for all nodes in your hostfile, you should be able to SSH from one node to another without being prompted for a password.

Next, try to run the `mpirun` command as a sanity check. For example, the command below should print out the
hostnames for each of the nodes.

```bash
mpirun -f hostfile -n {number of nodes} -ppn 1 hostname
```

## Out-of-Memory

One of the most frustrating errors when it comes to running training scripts is hitting "Out-of-Memory" on devices like CUDA, XPU or CPU. The entire script needs to be restarted and any progress is lost.

To address this problem, Accelerate provides the [`find_executable_batch_size`] utility that is heavily based on [toma](https://github.com/BlackHC/toma).
This utility retries code that fails due to OOM (out-of-memory) conditions and automatically lowers batch sizes. For each OOM condition, the algorithm decreases the batch size by half and retries the code until it succeeds.

To use [`find_executable_batch_size`], restructure your training function to include an inner function with `find_executable_batch_size` and build your dataloaders inside it. At a minimum, this only takes 4 new lines of code.

<Tip warning={true}> 

The inner function **must** take batch size as the first parameter, but we do not pass one to it when called. The wrapper will handle this for you. Any object (models, optimizers) that consumes device memory and is passed to the [`Accelerator`] also **must** be declared inside the inner function.

</Tip>

```diff
def training_function(args):
    accelerator = Accelerator()

+   @find_executable_batch_size(starting_batch_size=args.batch_size)
+   def inner_training_loop(batch_size):
+       nonlocal accelerator # Ensure they can be used in our context
+       accelerator.free_memory() # Free all lingering references
        model = get_model()
        model.to(accelerator.device)
        optimizer = get_optimizer()
        train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
        lr_scheduler = get_scheduler(
            optimizer, 
            num_training_steps=len(train_dataloader)*num_epochs
        )
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
            model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
        )
        train(model, optimizer, train_dataloader, lr_scheduler)
        validate(model, eval_dataloader)
+   inner_training_loop()
```

## Non-reproducible results between device setups

If you changed the device setup and observe different model performance, it is likely you didn't update your script when moving from one setup to another. Even if you're using the same script with the same batch size, the results will still be different on a TPU, multi-GPU, and single GPU.

For example, if you were training on a single GPU with a batch size of 16 and you move to a dual GPU setup, you need to change the batch size to 8 to have the same effective batch size. This is because when training with Accelerate, the batch size passed to the dataloader is the **batch size per GPU**.

To make sure you can reproduce the results between the setups, make sure to use the same seed, adjust the batch size accordingly, and consider scaling the learning rate.

For more details and a quick reference for batch sizes, check out the [Comparing performance between different device setups](../concept_guides/performance) guide.

## Performance issues on different GPUs

If your multi-GPU setup consists of different GPUs, you may encounter some performance issues:

- There may be an imbalance in GPU memory between the GPUs. In this case, the GPU with the smaller memory will limit the batch size or the size of the model that can be loaded onto the GPUs.
- If you are using GPUs with different performance profiles, the performance will be driven by the slowest GPU you are using because the other GPUs will have to wait for it to complete its workload.

Vastly different GPUs within the same setup can lead to performance bottlenecks.

## Ask for help

If none of the solutions and advice here helped resolve your issue, you can always reach out to the community and Accelerate team for help.

- Ask for help on the Hugging Face forums by posting your question in the [Accelerate category](https://discuss.huggingface.co/c/accelerate/18). Make sure to write a descriptive post with relevant context about your setup and reproducible code to maximize the likelihood that your problem is solved!

- Post a question on [Discord](http://hf.co/join/discord), and let the team and the community help you.

- Create an Issue on the Accelerate [GitHub repository](https://github.com/huggingface/accelerate/issues) if you think you've found a bug related to the library. Include context regarding the bug and details about your distributed setup to help us better figure out what's wrong and how we can fix it.


================================================
FILE: docs/source/concept_guides/big_model_inference.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Loading big models into memory

When loading a pre-trained model in PyTorch, the usual workflow looks like this:

```py
import torch

my_model = ModelClass(...)
state_dict = torch.load(checkpoint_file)
my_model.load_state_dict(state_dict)
```

In plain English, those steps are:
1. Create the model with randomly initialized weights
2. Load the model weights (in a dictionary usually called a state dict) from the disk
3. Load those weights inside the model

While this works very well for regularly sized models, this workflow has some clear limitations when we deal with a huge model: in step 1, we load a full version of the model in RAM, and spend some time randomly initializing the weights (which will be discarded in step 3). In step 2, we load another full version of the model in RAM, with the pre-trained weights. If you're loading a model with 6 billion parameters, this means you will need 24GB of RAM for each copy of the model, so 48GB in total (half of it to load the model in FP16).

<Tip warning={true}>

This API is quite new and still in its experimental stage. While we strive to provide a stable API, it's possible some small parts of the public API will change in the future.

</Tip>

## How the Process Works: A Quick Overview

<Youtube id="MWCSGj9jEAo" />

## How the Process Works: Working with Code

### Instantiating an empty model

The first tool Accelerate introduces to help with big models is a context manager [`init_empty_weights`] that helps you initialize a model without using any RAM so that step 1 can be done on models of any size. Here is how it works:

```py
from accelerate import init_empty_weights

with init_empty_weights():
    my_model = ModelClass(...)
```

For instance:

```py
with init_empty_weights():
    model = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
```

initializes an empty model with a bit more than 100B parameters. Behind the scenes, this relies on the meta device introduced in PyTorch 1.9. During the initialization under the context manager, each time a parameter is created, it is instantly moved to that device.

<Tip warning={true}>

    You can't move a model initialized like this on CPU or another device directly, since it doesn't have any data. It's also very likely that a forward pass with that empty model will fail, as not all operations are supported on the meta device.

</Tip>

### Sharded checkpoints

It's possible your model is so big that even a single copy won't fit in RAM. That doesn't mean it can't be loaded: if you have one or several GPUs, this is more memory available to store your model. In this case, it's better if your checkpoint is split into several smaller files that we call checkpoint shards.

Accelerate will handle sharded checkpoints as long as you follow the following format: your checkpoint should be in a folder, with several files containing the partial state dicts, and there should be an index in the JSON format that contains a dictionary mapping parameter names to the file containing their weights. You can easily shard your model with [`~Accelerator.save_model`]. For instance, we could have a folder containing:

```bash
first_state_dict.bin
index.json
second_state_dict.bin
```

with index.json being the following file:

```
{
  "linear1.weight": "first_state_dict.bin",
  "linear1.bias": "first_state_dict.bin",
  "linear2.weight": "second_state_dict.bin",
  "linear2.bias": "second_state_dict.bin"
}
```

and `first_state_dict.bin` containing the weights for `"linear1.weight"` and `"linear1.bias"`, `second_state_dict.bin` the ones for `"linear2.weight"` and `"linear2.bias"`.

### Loading weights

The second tool Accelerate introduces is a function [`load_checkpoint_and_dispatch`], that will allow you to load a checkpoint inside your empty model. This supports full checkpoints (a single file containing the whole state dict) as well as sharded checkpoints. It will also automatically dispatch those weights across the devices you have available (GPUs, CPU RAM), so if you are loading a sharded checkpoint, the maximum RAM usage will be the size of the biggest shard.

If you want to use big model inference with Transformers models, check out this [documentation](https://huggingface.co/docs/transformers/main/en/main_classes/model#large-model-loading).

Here is how we can use this to load the [GPT2-1.5B](https://huggingface.co/marcsun13/gpt2-xl-linear-sharded) model.

Let's download the sharded version of this model.

```bash
pip install huggingface_hub
```

```py
from huggingface_hub import snapshot_download
checkpoint = "marcsun13/gpt2-xl-linear-sharded"
weights_location = snapshot_download(repo_id=checkpoint)
```

In order to initialize the model, we will use the library minGPT. 

```bash
git clone https://github.com/karpathy/minGPT.git
pip install minGPT/
```

```py
from accelerate import init_empty_weights
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'gpt2-xl'
model_config.vocab_size = 50257
model_config.block_size = 1024

with init_empty_weights():
    model = GPT(model_config)
```

Then, load the checkpoint we just downloaded with:

```py
from accelerate import load_checkpoint_and_dispatch

model = load_checkpoint_and_dispatch(
    model, checkpoint=weights_location, device_map="auto", no_split_module_classes=['Block']
)
```

By passing `device_map="auto"`, we tell Accelerate to determine automatically where to put each layer of the model depending on the available resources:
- first, we use the maximum space available on the GPU(s)
- if we still need space, we store the remaining weights on the CPU
- if there is not enough RAM, we store the remaining weights on the hard drive as memory-mapped tensors


#### `no_split_module_classes`

This parameter will indicate that some of the modules with the name `"Block"` should not be split across different devices. You should set here all blocks that 
include a residual connection of some kind.


#### The `device_map`

You can see the `device_map` that Accelerate picked by accessing the `hf_device_map` attribute of your model:

```py
model.hf_device_map
```

```python out
{'transformer.wte': 0,
 'transformer.wpe': 0,
 'transformer.drop': 0,
 'transformer.h.0': 0,
 ...
 'transformer.h.21': 0, 
 'transformer.h.22': 1, 
 'transformer.h.23': 1, 
 'transformer.h.24': 1,
 ...
 'transformer.h.47': 1, 
 'transformer.ln_f': 1, 
 'lm_head': 1}
 ```

It's fully possible to create your own device map for the layers to use as well, specifying the GPU device to use (a number), `"cpu"`, or `"disk"` and pass this in:

```python
device_map = {
    "transformer.wte": "cpu",
    "transformer.wpe": 0,
    "transformer.drop": "cpu",
    "transformer.h.0": "disk"
}

model = load_checkpoint_and_dispatch(
    model, checkpoint=weights_location, device_map=device_map
)

```

### Run the model

Now that we have done this, our model lies across several devices, and maybe the hard drive. But it can still be used as a regular PyTorch model:

```py
from mingpt.bpe import BPETokenizer
tokenizer = BPETokenizer()
inputs = tokenizer("Hello, my name is").to(0)

outputs = model.generate(inputs, max_new_tokens=10, do_sample=False)[0]
tokenizer.decode(outputs.cpu().squeeze())
```

Behind the scenes, Accelerate added hooks to the model, so that:
- at each layer, the inputs are put on the right device (so even if your model is spread across several GPUs, it works)
- for the weights offloaded on the CPU, they are put on a GPU just before the forward pass and cleaned up just after
- for the weights offloaded on the hard drive, they are loaded in RAM then put on a GPU just before the forward pass and cleaned up just after

This way, your model can run for inference even if it doesn't fit on one of the GPUs or the CPU RAM!

<Tip warning={true}>

    This only supports the inference of your model, not training. Most of the computation happens behind `torch.no_grad()` context managers to avoid spending some GPU memory with intermediate activations.

</Tip>

### Designing a device map

You can let Accelerate handle the device map computation by setting `device_map` to one of the supported options (`"auto"`, `"balanced"`, `"balanced_low_0"`, `"sequential"`) or create one yourself if you want more control over where each layer should go.

<Tip>

    You can derive all sizes of the model (and thus compute a `device_map`) on a model that is on the meta device.

</Tip>

All the options will produce the same result when you don't have enough GPU memory to accommodate the whole model (which is to fit everything that can on the GPU, then offload weights on the CPU or even on the disk if there is not enough RAM). 

When you have more GPU memory available than the model size, here is the difference between each option:
- `"auto"` and `"balanced"` evenly split the model on all available GPUs, making it possible for you to use a batch size greater than 1.
- `"balanced_low_0"` evenly splits the model on all GPUs except the first one, and only puts on GPU 0 what does not fit on the others. This option is great when you need to use GPU 0 for some processing of the outputs, like when using the `generate` function for Transformers models
- `"sequential"` will fit what it can on GPU 0, then move on GPU 1 and so forth (so won't use the last GPUs if it doesn't need to).

<Tip>

    The options `"auto"` and `"balanced"` produce the same results for now, but the behavior of `"auto"` might change in the future if we find a strategy that makes more sense, while `"balanced"` will stay stable.

</Tip>

First note that you can limit the memory used on each GPU by using the `max_memory` argument (available in [`infer_auto_device_map`] and in all functions using it). When setting `max_memory`, you should pass along a dictionary containing the GPU identifiers (for instance `0`, `1` etc.) and the `"cpu"` key for the maximum RAM you want to use for CPU offload. The values can either be an integer (in bytes) or a string representing a number with its unit, such as `"10GiB"` or `"10GB"`.

Here is an example where we don't want to use more than 10GiB on each of the two GPUs and no more than 30GiB of CPU RAM for the model weights:

```python
from accelerate import infer_auto_device_map

device_map = infer_auto_device_map(my_model, max_memory={0: "10GiB", 1: "10GiB", "cpu": "30GiB"})
```

<Tip warning={true}>

    When a first allocation happens in PyTorch, it loads CUDA kernels which take about 1-2GB of memory depending on the GPU. Therefore you always have less usable memory than the actual size of the GPU. To see how much memory is actually used do `torch.ones(1).cuda()` and look at the memory usage.

    Therefore when you create memory maps with `max_memory` make sure to adjust the available memory accordingly to avoid out-of-memory errors.

</Tip>

Additionally, if you do some additional operations with your outputs without placing them back on the CPU (for instance inside the `generate` method of Transformers) and if you placed your inputs on a GPU, that GPU will consume more memory than the others (Accelerate always places the output back to the device of the input). Therefore if you would like to optimize the maximum batch size and you have many GPUs, give the first GPU less memory. For example, with BLOOM-176B on 8x80 A100 setup, the close-to-ideal map is:

```python
max_memory = {0: "30GIB", 1: "46GIB", 2: "46GIB", 3: "46GIB", 4: "46GIB", 5: "46GIB", 6: "46GIB", 7: "46GIB"}
```
As you can see, we gave the remaining 7 GPUs ~50% more memory than GPU 0.

If you opt to fully design the `device_map` yourself, it should be a dictionary with keys being module names of your model and values being a valid device identifier (for instance an integer for the GPUs) or `"cpu"` for CPU offload, `"disk"` for disk offload. The keys need to cover the whole model, you can then define your device map as you wish: for instance, if your model has two blocks (let's say `block1` and `block2`) which each contain three linear layers (let's say `linear1`, `linear2` and `linear3`), a valid device map can be:

```python
device_map = {"block1": 0, "block2": 1}
```

another one that is valid could be:

```python
device_map = {"block1": 0, "block2.linear1": 0, "block2.linear2": 1, "block2.linear3": 1}
```

On the other hand, this one is not valid as it does not cover every parameter of the model:

```python
device_map = {"block1": 0, "block2.linear1": 1, "block2.linear2": 1}
```

<Tip>

    To be the most efficient, make sure your device map puts the parameters on the GPUs in a sequential manner (e.g. don't put one of the first weights on GPU 0, then weights on GPU 1 and the last weight back to GPU 0) to avoid making many transfers of data between the GPUs.

</Tip>

## CPU offload only

If you want to offload your model on CPU, you can use [`cpu_offload`]. As a result, all parameters of the model will be offloaded and only one copy of the state dict of the model will be kept. During the forward pass, parameters will be extracted from that state dict and put on the execution device and passed as they are needed, then offloaded again. 

```python
cpu_offload(model, execution_device)
```

You can also use [`cpu_offload_with_hook`]. This function offloads a model on the CPU and puts it back to an execution device when executed. The difference with [`cpu_offload`] is that the model stays on the execution device after the forward and is only offloaded again when the `offload` method of the returned `hook` is called. Furthermore, [`cpu_offload_with_hook`] is more performant but less memory saving. It is useful for pipelines running a model in a loop:

```python
model_1, hook_1 = cpu_offload_with_hook(model_1, execution_device)
model_2, hook_2 = cpu_offload_with_hook(model_2, execution_device, prev_module_hook=hook_1)
model_3, hook_3 = cpu_offload_with_hook(model_3, execution_device, prev_module_hook=hook_2)

hid_1 = model_1(input)
for i in range(50):
    # model1 is offloaded on the CPU at the first iteration, model 2 stays on the GPU for this whole loop.
    hid_2 = model_2(hid_1)
# model2 is offloaded to the CPU just before this forward.
hid_3 = model_3(hid_2)

# For model3, you need to manually call the hook offload method.
hook_3.offload()
```

## Disk offload only

To perform disk offload, you can use [`disk_offload`]. As a result, all parameters of the model will be offloaded as memory-mapped arrays in a given folder. During the forward pass, parameters will be accessed from that folder and put on the given execution device as they are needed, then offloaded again.

```python
disk_offload(model, offload_dir, execution_device)
```

## Limits and further development

We are aware of the current limitations in the API:

- [`infer_auto_device_map`] (or `device_map="auto"` in [`load_checkpoint_and_dispatch`]) tries to maximize GPU and CPU RAM it sees available when you execute it. While PyTorch is very good at managing GPU RAM efficiently (and giving it back when not needed), it's not entirely true with Python and CPU RAM. Therefore, an automatically computed device map might be too intense on the CPU. Move a few modules to the disk device if you get crashes due to a lack of RAM.
- [`infer_auto_device_map`] (or `device_map="auto"` in [`load_checkpoint_and_dispatch`]) attributes devices sequentially (to avoid moving things back and forth) so if your first layer is bigger than the size of the GPU you have, it will end up with everything on the CPU/Disk.
- [`load_checkpoint_and_dispatch`] and [`load_checkpoint_in_model`] do not perform any check on the correctness of your state dict compared to your model at the moment (this will be fixed in a future version), so you may get some weird errors if trying to load a checkpoint with mismatched or missing keys.
- The model parallelism used when your model is split on several GPUs is naive and not optimized, meaning that only one GPU works at a given time and the other sits idle.
- When weights are offloaded on the CPU/hard drive, there is no pre-fetching (yet, we will work on this for future versions) which means the weights are put on the GPU when they are needed and not before.
- Hard-drive offloading might be very slow if the hardware you run on does not have fast communication between disk and CPU (like NVMes).


================================================
FILE: docs/source/concept_guides/context_parallelism.md
================================================
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Context Parallel in 🤗`accelerate`

This guide covers the basics of using context parallelism in 🤗`accelerate`. For the more curious readers, we will also cover some technicalities in the later sections.

See also the very related [Guide to Sequence Parallelism](./sequence_parallelism.md).

## Why context parallelism?

With the advent of large language models, and recently reasoning models, the sequence length has been growing rapidly. This, combined with quadratic memory complexity of attention, has led to a need for more efficient ways to train models with long sequences.
With sequence length of 128k, the memory requirement of the attention matrix is `128k * 128k * 2 bytes * num_heads = ~32 GB * num_heads` for `bf16` precision, given vanilla attention implementation. Granted, with usage of `flash attention` or `SDPA` which do not materialize these attention weights, this decreases drastically, but the growth in memory requirements is still considerable.

Context parallelism allows us to shard the inputs to the attention computation along the sequence dimension and compute the attention in parallel on multiple GPUs. With this, we can train models with long sequences, scaling potentially to 1M+ sequence length.

## How to use context parallelism?

```diff
from accelerate.utils import ParallelismConfig, TorchContextParallelConfig

+ cp_config = TorchContextParallelConfig(
+       cp_comm_strategy="alltoall", # no need to use cp_config at all, if you want to use the default "allgather"
+ )

+ parallelism_config = ParallelismConfig(
+     cp_size=8,
+     cp_handler=cp_config,  # or just cp_size=8, if you want to use the default "allgather"
+ )

accelerator = Accelerator(
    ...,
    parallelism_config=parallelism_config,
)
```

As with any other feature in 🤗`accelerate`, you can enable context parallelism also by passing the corresponding flags to `accelerate launch`.
In this case, it's no different:

```bash
accelerate launch --parallelism-config-cp-size 8 --parallelism-config-cp-comm-strategy [allgather|alltoall] ...
```

> [!Tip]
> You can also set the `cp_size` and `cp_comm_strategy` in the `accelerate config` command, which will save them in your `accelerate` configuration file, so you don't have to pass them every time you launch your script.

> [!Tip]
> Context parallelism is compatible with other parallelism strategies, such as data parallelism, tensor parallelism and FSDP2.
> You can simply combine them by setting your parallelism sizes to the desired values, e.g. `--parallelism-config-dp-size 8 --parallelism-config-tp-size 2 --parallelism-config-cp-size 8`. Or you can use the `ParallelismConfig` class to set them programmatically.

> [!Warning]
> Context parallelism is tightly coupled with `FSDP2`, which you can learn more about in the [FSDP2 introduction](fsdp1_vs_fsdp2.md). This means context parallelism only works if you use `FullyShardedDataParallelPlugin` or `--use-fsdp` with the version set to 2 in your
> program. If `FSDP2` is not used, an error will be raised.

> [!Warning]
> Context parallelism works only with [SDPA](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) and only with no mask or causal mask. We can't properly detect this for you, so it's your responsibility to ensure that you are using `SDPA` with no mask or causal mask. If you use any other attention implementation, it will raise an error.

After enabling context parallelism with the methods mentioned above, you can then apply it to your training loop. We provide a thin wrapper around [`torch.distributed.tensor.experimental.context_parallel`](https://docs.pytorch.org/docs/stable/distributed.tensor.html#torch.distributed.tensor.experimental.context_parallel) that you can use in your training loop, that abstracts some of the complexity of using it (more on this later). To minimize the changes you have to do in your training loop, we provide a context manager that is a `noop` if context parallelism is not enabled, and applies the context parallelism if it is enabled. This way, you can use it in your training loop without changing any code based on your parallelism configuration.
You can use it as follows:

```python
for batch in dataloader:
    with accelerator.maybe_context_parallel(
        buffers=[batch["input_ids"], batch["attention_mask"]],
        buffer_seq_dims=[1, 1],
        no_restore_buffers={batch["input_ids"], batch["labels"]},
    ):
        outputs = model(**batch)
        ...
```

> [!Warning]
> This context manager has to be recreated with each training step, as shown in the example above. It's crucial to do so.

This can potentially scale your context size to 1M+ sequence length. Below, we showcase the speed and memory usage of context parallelism for up to 256k context size. We can see that when we double the context size and the number of GPUs, we can achieve consistent memory usage, potentially enabling endless context length scaling.

<p align="center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/accelerate/examples/fsdp2/cp_perf.png" alt="context parallelism memory usage" />
  <br>
  <em>Figure 1: Memory usage and speed of context parallelism for up-to 256k context size.</em>
</p>

> [!Tip]
> These examples were created with a script you can find [in the examples folder](https://github.com/huggingface/accelerate/blob/main/examples/fsdp2/nd_parallel.py). To run the example on 8 H100 GPUs (128k sequence length), you can use the following command:
> ```bash
> accelerate launch --use-fsdp --fsdp-activation-checkpointing=TRUE examples/fsdp2/nd_parallel.py --cp-size=8 --sequence-length=128000
> ```


## Accelerate's interface

The context manager takes a few arguments, that are used to configure the context parallelism.

- `buffers`: This is a list of tensors that are to be sharded across the sequence dimension. These tensors are usually input ids, labels and attention mask.
- `buffer_seq_dims`: This is a list of integers that specify the sequence dimension of each buffer, in the order of the `buffers` list. If you pass `buffers=[input_ids, shift_labels]` with both having shape `[batch_size, sequence_length]`, you would pass `buffer_seq_dims=[1, 1]`, as the sequence dimension is the second dimension of the tensors. This is required for correct computation of the model outputs.
- `no_restore_buffers`: The implementation of context parallelism modifies the buffers in-place, converting them to `torch.distributed.tensor.DTensor`s. After the context manager exits, a communication kernel would need to be launched to restore the buffers to their original state (usually all-gather). This takes some time, so it is recommended to pass the same tensors as in the `buffers` argument, to avoid unnecessary communication, unless you are sure that you need to use the buffers after the context manager exits.


> [!Warning]
> Context parallelism is not compatible with `labels` that are a copy of `input_ids`, which models from 🤗 transformers can shift to enable causal language modeling themselves.
> Imagine this case:
> labels = [l1, l2, l3, l4, ... li]
> if we apply context parallelism, each rank would end up with a part of labels, such as this:
> labels_rank_0 = [l1, l2], labels_rank_1 = [l3, l4], ...
> after transformers modelling code shifts the labels, it would end up with:
> labels_rank_0 = [l2, PAD], labels_rank_1 = [l4, PAD], ...
> where `PAD` is a padding token. This would result in incorrect loss computation, as the labels are not aligned with the inputs anymore.
> Because of this, you need to manually shift the labels before passing them to the model.


## Configurable options
Accelerate provides only a single option to configure context parallelism (except for `cp_size`)

- `cp_comm_strategy`: The rotation method to use for the shards. We strongly recommend keeping this as `"allgather"`, as it's very likely it will outperform `"alltoall"` in most cases.

Context parallel size is rather self-explanatory: it's the number of ranks across which the inputs are to be sharded.
Context parallel shard rotation defines how the shards of the inputs are rotated across ranks. We'll cover the 2 options in more detail in the next section.

You can see an end-to-end example in the [ND parallel example](https://github.com/huggingface/accelerate/blob/main/examples/fsdp2/nd_parallel.py) file, where you can train an 8B model with up-to 128k context length on a single 8xH100 node. Using multi-node training, you can scale this to 1M+ sequence length on multiple GPUs. You can also seamlessly combine it with other parallelism strategies to fit your needs.

## Technical details

> [!Tip]
> This section is fairly technical, so if you don't need to learn the internals of context parallelism, you can skip it and start building 🚀

We're going to be using the word `shard` extensively in the following sections, so let's define it first. If we call a tensor `sharded` across the `Dth` dimension, across `N` ranks, we mean that this tensor is split into `N` parts, where each part of the tensor has shape `[..., D//N, ...]`.


## So how does it work?

Context parallelism works on sharding the `Q, K and V` matrices across the sequence dimension. Each rank has its assigned shard of `Q`, let's call it `Q_i`. This matrix stays only on this rank, during the whole computation. Similarly, each rank has its own shard of `K` and `V`, let's call them `K_i` and `V_i`. Then, each rank calculates attention with its own shard of `Q_i`, `K_i` and `V_i`, let's call it `attn_i`. During this computation, a communication kernel is launched to gather the `Ks` and `Vs` from all other ranks. Which communication primitive is used depends on the `cp_comm_strategy` option (referred to as `context_parallel_shard_rotation` in the pseudocode below).
This way, each rank gets to calculate local attention, first with `Q_i`, `K_i` and `V_i`, then with `K_j` and `V_j` from all other ranks. As each rank holds `Q, K and V` matrices that are sharded across the sequence dimension, the resulting matrices are smaller and can fit on a single GPU.

We can formalize this in the following pseudocode:
```python
comm_kernel = {"allgather": allgather, "alltoall": alltoall}[context_parallel_shard_rotation]
Qi, Ki, Vi = shard(Q, K, V, seq_dim)
attn[i] = attn(Qi, Ki, Vi)
for j in range(context_parallel_size):
    Kj, Vj = comm_kernel()
    attn[j] = attn(Qi, Kj, Vj) # [batch, num_heads, seq_len // context_parallel_size, head_dim]

final_attn = combine(attn)
```

## all-to-all vs all-gather

### all-gather
So what's the difference between all-to-all and all-gather? With all-gather, the communication is very simple. After (well, before, as it usually takes longer) we compute the local attention `attn_i`, we launch an all-gather to gather the `Ks` and `Vs` from all other ranks. Once this communication is done, each rank has all the `Ks` and `Vs` from all other ranks, and can compute the attention with them sequentially.
In an ideal scenario, all-gather finishes at the exact moment the calculation of `attn_i` is done. However, this never happens in practice, so the best realistic overlap is achieved when the full `attn_i` computation is overlapped with a part of the communication; then, to start the computation with `K_j` and `V_j`, we wait for the all-gather to finish.

### all-to-all
All-to-all, sometimes called `ring-rotation`, utilizes a ring-like communication pattern. After concluding the `attn_i` computation, an all-to-all is launched to send `K_i` and `V_i` to the neighbouring ranks. We then repeat this `context_parallel_size-1` times, so that each rank sees all the shards of `K` and `V` from all other ranks once. In an ideal scenario, we prefetch shards `K_i+1` and `V_i+1` from the neighbouring rank and this communication is exactly overlapped with the computation of our current `attn_i`. Again, realistically, this perfect overlap never happens. Given the nature of this approach, if we don't achieve perfect overlap, the penalty is much larger than with all-gather.

## How to choose the right rotation method?
In theory, all-to-all should be the better choice. Though in practice, it rarely is. Therefore, we default to all-gather, as it's more likely to achieve better performance. Extensive [benchmarks](https://discuss.pytorch.org/t/distributed-w-torchtitan-breaking-barriers-training-long-context-llms-with-1m-sequence-length-in-pytorch-using-context-parallel/215082) from the `torchtitan` team also show that all-to-all rarely outperforms all-gather. Though, we still provide both options, as you might find one to be better for your use case.

You can directly see this issue in the profiler output in the image below:
<p align="center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/accelerate/examples/fsdp2/cp_all_to_all.png" alt="all-to-all profiler output" />
  <br>
  <em>Figure 1: In red you can see the idle time, while we wait for the all-to-all kernel to finish. Highlighted in the first blue bar, you can see that it takes ~250us to finish, which is repeated N-1 times for each attention call, where N is the context parallel size.</em>
</p>


## Why only FSDP2?

We only support context parallelism with `FSDP2`, as we create a joint mesh of `context_parallel_size` and `dp_shard_size` to
utilize its full potential.
How it works is: we shard the model across the joint mesh of size `cp_size*dp_shard_size`, which maximizes the memory savings.
This is a "free lunch" of sorts, as `FSDP` communication is fully overlapped with the computation of attention, as shown in the images below.

<p align="center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/accelerate/examples/fsdp2/cp_why_fsdp2.png" alt="why FSDP2+CP" />
  <br>
  <em>Figure 2: In blue rectangles (Stream 23), you can see that the pre-fetch of `FSDP` shard is fully overlapped with the computation of attention (Stream 7), while in red rectangles (Stream 24), you can see that the all-gather kernel results in a bubble of idle time, in which our compute stream (7) is idle.</em>
</p>

In the figure above, you can also note the difference between all-to-all and all-gather. While in all-to-all (Figure 1), we launch a communication kernel N-1 times for each attention call, in all-gather (Figure 2), we launch a communication kernel only once. This results in a bigger bubble, but it only happens once per attention call, while in all-to-all, it happens N-1 times.

## Data dispatching in joint mesh

We make sure to dispatch the same batch of data to the whole `cp` subgroup, so that the results are correct. (Meaning each rank in `cp` subgroup gets the same batch of data.) However, we also dispatch different batches to each rank of `dp_shard` group.
Imagine it like this:
```
# 8 GPUS, --dp_shard_size 4, --cp_size 2
# mesh = [[0, 1], [2, 3], [4, 5], [6, 7]]
# model is sharded across the whole mesh (each GPU holds 1/8 of the model)
# GPUs 0,1 = batch 0
# GPUs 2,3 = batch 1
... and so on.
```


================================================
FILE: docs/source/concept_guides/deferring_execution.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Executing and deferring jobs

When you run your usual script, instructions are executed in order. Using Accelerate to deploy your script on several
GPUs at the same time introduces a complication: while each process executes all instructions in order, some may be
faster than others.

You might need to wait for all processes to have reached a certain point before executing a given instruction. For
instance, you shouldn't save a model before being sure every process is done with training, and you wouldn't want to 
continue training before all the model weights have been loaded in. To do this, just write the following line in your code:

```
accelerator.wait_for_everyone()
```

This instruction will block all the processes that arrive first until all the other processes have reached that
point (if you run your script on just one GPU or CPU, this won't do anything).

A few example cases of when to use this utility are listed below:

<Tip>

    Some of these are utilized with the [`~Accelerator.main_process_first`] context manager, which utilizes [`~Accelerator.wait_for_everyone`] to 
    run a particular set of code on the main process first, before triggering and launching the other processes.

</Tip>

## Downloading a Dataset 

When downloading a dataset, you should download it first on the main process and then load the cached dataset afterward

<Tip>

    `load_dataset` will perform a lock under the hood to stop multiple downloads from happening at once, but if you are downloading something 
    not using this library you should use this method.
    
</Tip>

```python
with accelerator.main_process_first():
    datasets = load_dataset("glue", "mrpc")
```

Under the hood this is the same as calling: 

```python
# First do something on the main process
if accelerator.is_main_process:
    datasets = load_dataset("glue", "mrpc")
else:
    accelerator.wait_for_everyone()

# And then send it to the rest of them
if not accelerator.is_main_process:
    datasets = load_dataset("glue", "mrpc")
else:
    accelerator.wait_for_everyone()
```

## Saving the `state_dict`

When saving the `state_dict` of the model, since you would normally save one file on just the main process
you should specify that:

```python
if accelerator.is_main_process:
    model = accelerator.unwrap_model(model)
    torch.save(model.state_dict(), "weights.pth")
```

## Loading in the `state_dict`

When loading in the `state_dict` to a model, optimizer, or scheduler, you should wait 
for all workers to have the weights loaded in before moving on to training

```python
with accelerator.main_process_first():
    state = torch.load("weights.pth")
    model.load_state_dict(state)
```

## Applying a multi-worker CPU operation 

Applying a `map()` operation on multiple workers, such as tokenizing, should be done on the 
main process first, and then propagated to each one. 

```python
datasets = load_dataset("glue", "mrpc")

with accelerator.main_process_first():
    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=["idx", "sentence1", "sentence2"],
    )
```

## Applying checks such as Early Stopping

To have a check that works with a flag set by a particular process, the `set_trigger` and `check_trigger` API should be used. Useful examples
for doing so can include situations such as using early stopping and monitoring the loss (as each loss slightly differs on each process).

Call [`Accelerator.set_trigger`] when your condition has been met, and [`Accelerator.check_trigger`] when checking if that condition has been met in any process:

```python
for (x,y) in data_loader:
    logits = model(x)
    loss = loss_func(logits, y)
    # Assume `should_do_early_stopping` is a custom defined function that returns a conditional
    if should_do_early_stopping(loss):
        accelerator.set_trigger()

    # Later in the training script when we need to check for the breakpoint
    if accelerator.check_trigger():
        break
```


================================================
FILE: docs/source/concept_guides/fsdp1_vs_fsdp2.md
================================================
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# FSDP1 vs FSDP2

This guide explains the key differences between `FSDP1` and `FSDP2` and helps you migrate your existing code to use `FSDP2` with minimal changes.

## How is FSDP2 better than FSDP1?

First, we want to understand how `FSDP1` and `FSDP2` work internally to understand the differences between them. This also helps us understand the limitations of `FSDP1` and how `FSDP2` solves them.

We'll be discussing a scenario where we have a single `Layer` that contains 3 `Linear` layers and is wrapped using `FSDP` to be sharded across 2 GPUs.

<div align="center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/accelerate/layer.png" alt="Layer">
</div>

### FSDP1
First, we have to understand the original `FSDP1` and the limitations it brings. It represents each `FSDP` module as a single `FlatParameter` which is a single 1D tensor that contains all of the module parameters, which then get sharded across ranks. I.e. if you wrap the `Layer` with `FSDP1`, you'd achieve something as such:

<div align="center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/accelerate/fsdp1.png" alt="FSDP1">
</div>

You might notice a problem. The whole `Layer` gets flattened into a single `FlatParameter`, which then gets sharded across ranks. But if it's a single `FlatParameter` object, how do we store metadata? That is one of the limitations. Properly storing per-parameter metadata such as `dtype`, `requires_grad`, etc. is not possible without some ugly hacks.

### FSDP2
This is why `FSDP2` was introduced. It doesn't use `FlatParameter`, instead it uses `DTensor` which is short for "Distributed Tensor". Each `DTensor` basically represents a vanilla `torch.Tensor` that has been sharded across ranks. It contains metadata about the original `torch.Tensor` and how it's sharded, what is the [placement type](https://pytorch.org/docs/stable/distributed.tensor.html#module-torch.distributed.tensor.placement_types) and so on. This is why it's called `per-parameter sharding`. The following figure shows the difference:

<div align="center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/accelerate/fsdp2.png" alt="FSDP2">
</div>

Each Parameter of the original `Layer` is sharded across the 0th dimension, and split between 2 GPUs. Now, each `Linear` layer is a separate `DTensor` and storing metadata per-parameter is possible and straightforward.


> [!TIP] 
> In the image above, the tensors were sharded across the 1st dimension for the sake of fitting the image on the screen, in reality, they are sharded across the 0th dimension as stated above

## What does FSDP2 offer?

`FSDP2` is a new and improved version of PyTorch's fully-sharded data parallel training API. Its main advantage is using `DTensor` to represent sharded parameters. Compared to `FSDP1`, it offers:
- Simpler internal implementation, where each `Parameter` is a separate `DTensor`
- Enables simple partial parameter freezing because of the above, which makes methods such as [`LORA`](https://huggingface.co/papers/2106.09685) work out of the box
- With `DTensor`, `FSDP2` supports mixing `fp8` and other parameter types in the same model out of the box
- Faster and simpler checkpointing without extra communication across ranks using `SHARDED_STATE_DICT` and [`torch.distributed.checkpoint`](https://pytorch.org/docs/stable/distributed.checkpoint.html), this way, each rank only saves its own shard and corresponding metadata
- For loading, it uses a `state_dict` of the sharded model to directly load the sharded parameters
- Support for asynchronous checkpointing, where parameters are first copied to CPU memory, after this, main thread continues training while another thread stores the parameters on disk
- Memory efficiency and deterministic memory usage, `FSDP2` doesn't use `recordStream` anymore and uses stream-to-stream synchronization (for more technical details see [this forum post](https://dev-discuss.pytorch.org/t/fsdp-cudacachingallocator-an-outsider-newb-perspective/1486) and [this issue](https://github.com/pytorch/pytorch/issues/114299))
- In the future, optimizations of the communication patterns via `torch.compile` are planned, further improving the performance and memory efficiency


## API Differences

We have already discussed the internal differences; now let's discuss the differences that you, as a user, will need to know.

Here are the main changes in configuration options when using `FSDP2` through the `accelerate` CLI:

Previous (`FSDP1`) | New (`FSDP2`) | What Changed
-- | -- | --
`--fsdp_sharding_strategy` | `--fsdp_reshard_after_forward` | replaces `--fsdp_sharding_strategy`, changed to `true` (previously `FULL_SHARD`) or `false` (previously `SHARD_GRAD_OP`)
`--fsdp_backward_prefetch` | \*\***REMOVED**\*\* | `FSDP2` uses previous `BACKWARD_PRE` option by default, as only this allows communication and computation overlap
`--fsdp_forward_prefetch` | \*\***NOT YET IMPLEMENTED**\*\* | How to implement this is under active discussion, for now it is not supported in `FSDP2`
`--fsdp_sync_module_states` | \*\***REMOVED**\*\* | with `FSDP2`, this parameter becomes redundant
`--fsdp_cpu_ram_efficient_loading` | `--fsdp_cpu_ram_efficient_loading` | if `true`, `FSDP2` will similarly load the model only on rank 0, and then parameters get synced to other ranks, this is the same behavior as `FSDP1`, however, setting `--fsdp_sync_module_states` isn't required anymore
`--fsdp_state_dict_type` | `--fsdp_state_dict_type` | `LOCAL_STATE_DICT` becomes obsolete and with `FSDP2` `SHARDED_STATE_DICT` is the default option, which results in no extra communication and each rank saving its own shard, other possible option is `FULL_STATE_DICT` which results in extra communication and spike in memory usage but saves the full model from rank 0.
`--fsdp_use_orig_params` | \*\***REMOVED**\*\* | `FSDP2` uses a `DTensor` class in the background, which means it *always* uses the original parameters by default
\*\***NEW**\*\* | `--fsdp_version` | `1` is the default option, to not break existing code, set to `2` to use `FSDP2`

For all other options that remain unchanged, see the [`FSDP` documentation](../usage_guides/fsdp.md).

## How to Switch to FSDP2

### If using Python code:
Simply set `fsdp_version=2` when creating your plugin and replace options according to the table above.

```python
from accelerate import FullyShardedDataParallelPlugin, Accelerator

fsdp_plugin = FullyShardedDataParallelPlugin(
    fsdp_version=2
    # other options...
)
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
```

### If using YAML config:
Use our conversion tool:
```bash
accelerate to-fsdp2 --config_file config.yaml --output_file new_config.yaml
```

This will automatically convert all FSDP1 settings to their FSDP2 equivalents. Use `--overwrite` to update the existing file instead of creating a new one.


================================================
FILE: docs/source/concept_guides/fsdp_and_deepspeed.md
================================================
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# FSDP vs DeepSpeed

Accelerate offers flexibility of training frameworks, by integrating two extremely powerful tools for distributed training, namely [Pytorch FSDP](../usage_guides/fsdp) and [Microsoft DeepSpeed](../usage_guides/deepspeed). The aim of this tutorial is to draw parallels, as well as to outline potential differences, to empower the user to switch seamlessly between these two frameworks.

<Tip>

  To switch between the frameworks, we recommend launching code with `accelerate launch`, passing in the correct config file with `--config_file`, or passing in the respective arguments directly for [FSDP and DeepSpeed](../package_reference/cli#accelerate-launch).

  Example Accelerate configurations can be found here for [DeepSpeed](../usage_guides/deepspeed#accelerate-deepspeed-plugin) and [FSDP](../usage_guides/fsdp#how-it-works-out-of-the-box), or in the [example zoo under "Launch Configurations"](../usage_guides/explore)
 
</Tip>

<Tip warning={true}>

This tutorial is for single-node, multi-GPU, scenarios only.

</Tip>

## Configuring Functionalities

Model tensors are split into different GPUs in an attempt to scale up model sizes; this is termed *sharding* in FSDP, and *partitioning* in DeepSpeed. FSDP sharding and DeepSpeed ZeRO (partitioning) stages are configured by `--fsdp_sharding_strategy`, and `--zero_stage`, respectively.  In particular, FSDP `FULL_SHARD` maps to DeepSpeed ZeRO stage `3`; see this [comprehensive mapping between FSDP sharding and DeepSpeed ZeRO settings](../usage_guides/fsdp#mapping-between-fsdp-sharding-strategies-and-deepspeed-zero-stages). The below table summarizes and groups similar settings:

Group | Framework | Configuration | Example | Restrictions (if any)
--|--|--|--|--
sharding / partitioning | FSDP<br>DeepSpeed | `--fsdp_sharding_strategy`<br>`--zero_stage` | `1` (`FULL_SHARD`) <br>`3` | 
offload | FSDP<br>DeepSpeed | `--fsdp_offload_params`<br>`--offload_param_device`<br>`--offload_optimizer_device` | `true`<br>`cpu`<br>`cpu` | all or nothing <br><br> 
model loading | FSDP<br>DeepSpeed | <span style="white-space:nowrap;">`--fsdp_cpu_ram_efficient_loading`</span><br>`--zero3_init_flag` | `true`<br>`true` | <br>only ZeRO 3
efficient checkpointing | FSDP<br>DeepSpeed | `--fsdp_state_dict_type`<br>`--zero3_save_16bit_model` |  `SHARDED_STATE_DICT`<br>`true` |  <br>only ZeRO 3
weights prefetching | FSDP<br><br>DeepSpeed | `--fsdp_forward_prefetch`<br>`--fsdp_backward_prefetch`<br>None | `true`<br>`BACKWARD_PRE` | <br><br>
model | FSDP<br><br>DeepSpeed |  `--fsdp_auto_wrap_policy`<br><span style="white-space:nowrap;">`--fsdp_transformer_layer_cls_to_wrap`</span><br>None | `TRANSFORMER_BASED_WRAP`<br><Layer Class> |<br>Usually not needed <br>Transparent to user.
parameters summoning | FSDP<br>DeepSpeed | `--fsdp_use_orig_params`<br>None | `true` | required for `torch.compile`<br>Transparent to user
parameters syncing | FSDP<br>DeepSpeed | `--fsdp_sync_module_states`<br>None | `true` | 
training | FSDP<br>DeepSpeed | None<br>`--gradient_accumulation_steps`<br>`--gradient_clipping` | <br>`auto`<br>`auto` | Transparent to user

For detailed descriptions of the above, refer to [`Accelerate` launch documentation](../package_reference/cli#accelerate-launch).

<Tip>

    To access other DeepSpeed configurations, such as mixed precision settings, 
    you need to pass in a `--deepspeed_config_file`, see the [documentation](../usage_guides/deepspeed#deepspeed-config-file).  

    DeepSpeed can also be configured via [`DeepSpeedPlugin`], e.g., `DeepSpeedPlugin.zero_stage` is the equivalent of `--zero_stage`, and `DeepSpeedPlugin.hf_ds_config` can be used to pass `--deepspeed_config_file`.

</Tip>

<Tip>

    FSDP can be also configured via [`FullyShardedDataParallelPlugin`], e.g., `FullyShardedDataParallelPlugin.sharding_strategy` is equivalent of `--fsdp_sharding_strategy`.
    
</Tip>

### Checkpointing

Do note that FSDP can be configured via `--fsdp_state_dict_type` to save either full or sharded checkpoints.

<Tip>

    For DeepSpeed Zero3, one could pass a `--zero3_save_16bit_model true`, which conveniently consolidates the model to a single rank and saves; this is the FSDP equivalent of `fsdp_state_dict_type: FULL_STATE_DICT`. 

</Tip>

<Tip warning={true}>

    For large models, consolidating the model to a single rank can be very slow.

</Tip>

<Tip>

    For quicker checkpointing, for FSDP use `fsdp_state_dict_type: SHARDED_STATE_DICT`, and for DeepSpeed Zero3 [use the `zero_to_fp32.py` script to post-convert sharded checkpoints](https://www.deepspeed.ai/tutorials/zero/#extracting-weights).


</Tip>

### Offloading

FSDP only allows *all-or-nothing* offload (i.e., either offload parameters, gradients, and optimizer, or keep them all in GPU), but DeepSpeed can offload parameters and optimizer differently. Furthermore, DeepSpeed also supports [offloading to NVME](https://www.deepspeed.ai/docs/config-json/#parameter-offloading).

### Prefetching

FSDP allows two prefetching configurations `--fsdp_forward_prefetch` and `--fsdp_backward_prefetch` to improve overlap of comms / computation at a cost of extra memory, see [FSDP documentation](https://pytorch.org/docs/stable/fsdp.html). 
For DeepSpeed, the prefetching will be turned on when needed, and it turns on depending on certain hyper-params like `stage3_param_persistence_threshold`, `stage3_max_reuse_distance`, etc, [that can be configured for Zero3](https://www.deepspeed.ai/docs/config-json/#parameter-offloading); `accelerate` may set these hyper-params automatically if you don't set those explicitly in the deepspeed config file.

<Tip>

    For FSDP set `fsdp_backward_prefetch: BACKWARD_PRE` for improved throughputs if memory allows.

</Tip>

### Model Loading

While FSDP requires an explicit `--fsdp_cpu_ram_efficient_loading true` to activate efficient model loading, `transformers` will activate the similar feature whenever DeepSpeed Zero3 is used.

<Tip>

    For FSDP, whenever setting `--fsdp_cpu_ram_efficient_loading true`, `accelerate` will automatically set `sync_module_states` to true. 
    For RAM efficient loading the weights will be loaded only in a single rank, and thus requires `sync_module_states` to broadcast weights to other ranks.

</Tip>

### Model

FSDP requires an explicit `--fsdp_auto_wrap_policy` for the algorithm to decide how to schedule the all-gather and reduce-scatter operations. But for DeepSpeed this is transparent to the user.

<Tip>

    For FSDP, simply set `fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP`. With the latest [`transformers`] versions, we try our best to figure out the suitable `fsdp_transformer_layer_cls_to_wrap` for HF transformers models. However, if you get an error regarding it, please specify this.

</Tip>

### Parameters Summoning

FSDP requires an explicit `--fsdp_use_orig_params` flag if using `torch.compile`, see [the pytorch documentation](https://pytorch.org/docs/stable/fsdp.html#module-torch.distributed.fsdp). For DeepSpeed this is transparent to the user.

<Tip>

    For FSDP, when using `torch.compile` please set `fsdp_use_orig_params: True`.

</Tip>


## Training

Deepspeed requires explicit `--gradient_accumulation_steps` and `--gradient_clipping` flags. For FSDP this is transparent to the user.

<Tip>

    When using DeepSpeed, set `gradient_accumulation_steps: "auto"` and `gradient_clipping: "auto"` to automatically pick up values set in the [`Accelerator`] or [`TrainingArguments`] (if using `transformers`).

</Tip>


## On Differences in Data Precision Handling

To discuss how data precision is handled in both FSDP and Deepspeed, it is instructive to first give an overview of how model parameters are handled in these frameworks. Before the model / optimizer parameters are distributed across GPUs, parameter preparation is involved to first "flatten" them to one-dimensional [`torch.Tensor`](https://pytorch.org/docs/stable/tensors.html#torch-tensor). The implementation of FSDP / DeepSpeed varies in the respect of the `dtype` in which these "flattened" parameters are stored, and there are ramifications with regards to how [`torch.Optimizer`](https://pytorch.org/docs/stable/optim.html#module-torch.optim) allocate their `dtype`s. The table below outlines the processes for both frameworks; the "Local" column indicates the process occurring at a per-gpu level, therefore any memory overheads by upcasting should be understood to be amortized by the number of gpus used.

<Tip>

    As a rule of thumb, for stable training with automatic mixed precision, all the trainable parameters have to be in `torch.float32`.

</Tip>

Process | Local | Framework | Details
--|--|--|--
Loading, i.e., [`AutoModel.from_pretrained(..., torch_dtype=torch_dtype)`] |  
Preparation, i.e., creation of "flat params" | ✅ | FSDP<br>DeepSpeed | created in `torch_dtype`.<br> disregards `torch_dtype`, created in `float32`.
Optimizer initialization | ✅ | FSDP<br>DeepSpeed  | creates parameters in `torch_dtype`<br> creates parameters in `float32`
Training Step, i.e, forward, backward, reduction | | FSDP<br>DeepSpeed  | follows [`MixedPrecision`](https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.MixedPrecision)<br> follows `deepspeed_config_file` mixed precision settings.
Optimizer (Pre-Step) | ✅ | FSDP<br>DeepSpeed | upcasting (if any) to `torch_dtype`<br>upcasted to `float32`
Optimizer (Actual Step) | ✅ | FSDP<br>DeepSpeed  | occurs in `torch_dtype` <br> occurs in `float32`.

<Tip warning={true}>

    Therefore, when using DeepSpeed with a small number of GPUs, be aware of potentially significant memory overheads due to the upcasting during preparation.

</Tip>

<Tip>

    With FSDP, in the absence of mixed precision, it is possible to operate the [`torch.Optimizer`](https://pytorch.org/docs/stable/optim.html#module-torch.optim) in low precision `torch_dtype`, which may be helpful when using a small number of GPUs.

</Tip>

<Tip warning={true}>

    With mixed precision, FSDP and DeepSpeed will upcast in the model preparation step (c.f. table above). But do note that FSDP will then save checkpoints in the upcasted precision; Deepspeed may still save low precision checkpoints if `--zero3_save_16bit_model` is specified.

</Tip>


To clarify the above table, consider the concrete examples below; the optimizer pre-step and actual step are combined for brevity. With FSDP it is possible to operate in the two modes shown below, but DeepSpeed can only operate in one.

Framework | Model Loading (`torch_dtype`) | Mixed Precision | Preparation (Local) | Training | Optimizer (Local)
--|--|--|--|--|--
FSDP | bf16 | default (none) | bf16 | bf16 | bf16
FSDP | bf16 | bf16 | fp32 | bf16 | fp32
DeepSpeed   | bf16 | bf16 | fp32 | bf16 | fp32


================================================
FILE: docs/source/concept_guides/gradient_synchronization.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Gradient synchronization

PyTorch's distributed module operates by communicating back and forth between all of the GPUs in your system.
This communication takes time, and ensuring all processes know the states of each other happens at particular triggerpoints
when using the `ddp` module. 

These triggerpoints are added to the PyTorch model, specifically their `forward()` and `backward()` methods. 
This happens when the model is wrapped with `DistributedDataParallel`:
```python
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel

model = nn.Linear(10, 10)
ddp_model = DistributedDataParallel(model)
```
In Accelerate this conversion happens automatically when calling [`~Accelerator.prepare`] and passing in your model.

```diff
+ from accelerate import Accelerator
+ accelerator = Accelerator()
  import torch.nn as nn
- from torch.nn.parallel import DistributedDataParallel

  model = nn.Linear(10,10)
+ model = accelerator.prepare(model)
```

## The slowdown in gradient accumulation

You now understand that PyTorch adds hooks to the `forward` and `backward` method of your PyTorch model when 
training in a distributed setup. But how does this risk slowing down your code?

In DDP (distributed data parallel), processes are expected to perform specific operations in a specific order
at specific points, and these must also occur at roughly the same time before moving on.

The most direct example is when you update model parameters through
`optimizer.step()`.
Without gradient accumulation, all instances of the model need to have updated
their gradients computed, collated, and updated before moving on to the next
batch of data.
When performing gradient accumulation, you accumulate `n` loss gradients and
skip `optimizer.step()` until `n` batches have been reached. As all training
processes only need to synchronize by the time `optimizer.step()` is called,
without any modification to your training step, this needless inter-process
communication can cause a significant slowdown.

 How can you avoid this overhead?

## Solving the slowdown problem

Since you are skipping model parameter updates when training on these batches, their gradients do not need to be synchronized until the point where `optimizer.step()` is actually called. 
PyTorch cannot automagically tell when you need to do this, but they do provide a tool to help through the [`no_sync`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel.no_sync) context manager
that is added to your model after converting it to DDP.

Under this context manager, PyTorch will skip synchronizing the gradients when
`.backward()` is called, and the first call to `.backward()` outside this 
context manager will trigger the synchronization. See an example below:
```python
ddp_model, dataloader, optimizer = accelerator.prepare(model, dataloader, optimizer)

for index, batch in enumerate(dataloader):
    inputs, targets = batch
    # Trigger gradient synchronization on the last batch
    if index != (len(dataloader) - 1):
        with ddp_model.no_sync():
            # Gradients only accumulate
            outputs = ddp_model(inputs)
            loss = loss_func(outputs)
            accelerator.backward(loss)
    else:
        # Gradients finally sync
        outputs = ddp_model(inputs)
        loss = loss_func(outputs)
        accelerator.backward(loss)
        optimizer.step()
```

In Accelerate to make this an API that can be called no matter the training device (though it may not do anything if you are not in a distributed system!),
`ddp_model.no_sync` gets replaced with [`~Accelerator.no_sync`] and operates the same way:

```diff
  ddp_model, dataloader, optimizer = accelerator.prepare(model, dataloader, optimizer)

  for index, batch in enumerate(dataloader):
      inputs, targets = batch
      # Trigger gradient synchronization on the last batch
      if index != (len(dataloader)-1):
-         with ddp_model.no_sync():
+         with accelerator.no_sync(model):
              # Gradients only accumulate
              outputs = ddp_model(inputs)
              loss = loss_func(outputs, targets)
              accelerator.backward(loss)
      else:
          # Gradients finally sync
          outputs = ddp_model(inputs)
          loss = loss_func(outputs)
          accelerator.backward(loss)
          optimizer.step()
          optimizer.zero_grad()
```

As you may expect, the [`~Accelerator.accumulate`] function wraps around this conditional check by keeping track of the current batch number, leaving you with the final
gradient accumulation API:

```python
ddp_model, dataloader, optimizer = accelerator.prepare(model, dataloader, optimizer)

for batch in dataloader:
    with accelerator.accumulate(model):
        optimizer.zero_grad()
        inputs, targets = batch
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
```

As a result, you should either use *`accelerator.accumulate` or `accelerator.no_sync`* when it comes to API choice. 

## Just how much of a slowdown is there, and easy mistakes you can make

To set up a realistic example, consider the following setup:

* Two single-GPU T4 nodes and one node with two GPUs
* Each GPU is a T4, and are hosted on GCP
* The script used is a modification of the [NLP Example](https://github.com/muellerzr/timing_experiments/blob/main/baseline.py) script
* Batch size per GPU is 16, and gradients are accumulated every 4 steps

All scripts are available in [this repository](https://github.com/muellerzr/timing_experiments).

If not careful about gradient synchronization and GPU communication, a *large* amount of time can be wasted 
from when these GPUs communicate to each other during unnecessary periods.

By how much?

Reference:
- Baseline: uses no synchronization practices discussed here
- `no_sync` improperly: `no_sync` only around the `backward` call, not the `forward`
- `no_sync`: using the `no_sync` pattern properly
- `accumulate`: using [`~Accelerator.accumulate`] properly

Below are the average seconds per batch iterating over 29 batches of data for each setup on both a single node and on the dual-node setup:

|             | Baseline  | `no_sync` improperly | `no_sync` | `accumulate`| 
| :---------: | :-------: | :------------------: | :-------: | :---------: |
| Multi-Node  | 2±0.01s    | 2.13±0.08s | **0.91±0.11s** | **0.91±0.11s** |
| Single Node | 0.50±0.01s | 0.50±0.01s | **0.41±0.015s** | **0.41±0.015s** |

As you can see, if you are not careful about how you set up your gradient synchronization, you can get more than a 2x slowdown during training!

If you are worried about making sure everything is done properly, we highly recommend utilizing the [`~Accelerator.accumulate`] function and passing in
`gradient_accumulation_steps` or `gradient_accumulation_plugin` to the [`Accelerator`] object so Accelerate can handle this for you.

### `no_sync` requires additional GPU memory when using FSDP

Be aware that not syncing gradients can have adverse effects while performing FSDP training. As it has been warned in `torch`, the [`no_sync` context manager for FSDP](https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.FullyShardedDataParallel.no_sync) will require additional memory.

Therefore in memory intensive situations while using FSDP, we recommend to set `sync_each_batch` to `True` in the [`~utils.GradientAccumulationPlugin`] to disable `no_sync`.

See the example below where we fine-tune Mixtral (47B parameters) on 8 A100-80GB GPUs. We see that even for a modest `gradient_accumulation_steps=2` we quickly go out-of-memory (OOM) if `no_sync` is enabled. Again, this is due to additional memory overheads due to FSDP's `no_sync`. However, if `no_sync` is disabled via `sync_each_batch=True`, then the memory consumption for `gradient_accumulation_steps=16` reverts to that of `gradient_accumulation_steps=1`.

| Model           | `no_sync` (accum=1) | `no_sync` (accum=2) | `no_sync` disabled (accum=16)
| :-------------: | :-----------------: | :-----------------: | :-----------------: 
mixtral 8x7B      | 69G                 | OOM                 | 69G

> [!WARNING] 
> Disabling `no_sync` means there _will be slowdown_ due to the extra data syncs, as explained by the earlier sections of this guide.

================================================
FILE: docs/source/concept_guides/internal_mechanism.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Accelerate's internal mechanisms

Internally, Accelerate works by first analyzing the environment in which the script is launched to determine which
kind of distributed setup is used, how many different processes there are and which one the current script is in. All
that information is stored in the [`~AcceleratorState`].

This class is initialized the first time you instantiate an [`~Accelerator`] as well as performing any
specific initialization your distributed setup needs. Its state is then uniquely shared through all instances of
[`~state.AcceleratorState`]. (The same can also be done with the [`PartialState`], a more barebones version it inherits from)

Then, when calling [`~Accelerator.prepare`], the library:

- wraps your model(s) in the container adapted for the distributed setup,
- wraps your optimizer(s) in an [`~optimizer.AcceleratedOptimizer`],
- wraps your scheduler(s) in an [`~scheduler.AcceleratedScheduler`]
- creates a new version of your dataloader(s) in a [`~data_loader.DataLoaderShard`] or [`~data_loader.DataLoaderDispatcher`]

While the model(s), optimizer(s), and scheduler(s) are just put in simple wrappers, the dataloader(s) are re-created. This is mostly
because PyTorch does not let the user change the `batch_sampler` of a dataloader once it's been created and the
library handles the sharding of your data between processes by changing that `batch_sampler` to yield every other
`num_processes` batches (if enabled).

The [`~data_loader.DataLoaderShard`] subclasses `DataLoader` to add the following functionality:

- it synchronizes the appropriate random number generator of all processes at each new iteration, to ensure any
  randomization (like shuffling) is done the exact same way across processes.
- it puts the batches on the proper device before yielding them (unless you have opted out of
  `device_placement=True`).
  
The [`~data_loader.DataLoaderDispatcher`] subclass differs from the [`~data_loader.DataLoaderShard`] in that when iterating through the `DataLoader`, the data is all starting from process 0 and *then* split and sent off to each process rather than it happening at the dataset level.

The random number generator synchronization will by default synchronize:

- the `generator` attribute of a given sampler (like the PyTorch `RandomSampler`) for PyTorch >= 1.6
- the main random number generator in PyTorch <=1.5.1

You can choose which random number generator(s) to synchronize with the `rng_types` argument of the main
[`Accelerator`]. In PyTorch >= 1.6, it is recommended to rely on a local `generator` to avoid
setting the same seed in the main random number generator in all processes.

<Tip warning={true}>

    Synchronization of the main torch (or CUDA or XLA) random number generator will affect any other potential random
    artifacts you could have in your dataset (like random data augmentation) in the sense that all processes will get
    the same random numbers from the torch random modules (so will apply the same random data augmentation if it's
    controlled by torch).

</Tip>

<Tip>

    The randomization part of your custom sampler, batch sampler or iterable dataset should be done using a local
    `torch.Generator` object (in PyTorch >= 1.6), see the traditional `RandomSampler`, as an example.

</Tip>

If you have [`torchdata>=0.8.0`](https://github.com/pytorch/data/tree/main) installed, and you have passed `use_stateful_dataloader=True` into your [`~utils.DataLoaderConfiguration`], these classes will directly inherit from `StatefulDataLoader` instead, and maintain a `state_dict`.

For more details about the internals, see the [Internals page](../package_reference/torch_wrappers).


================================================
FILE: docs/source/concept_guides/low_precision_training.md
================================================
<!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Low precision training methods

The release of new kinds of hardware led to the emergence of new training paradigms that better utilize them. Currently, this is in the form of training
in 8-bit precision using packages such as [TransformersEngine](https://github.com/NVIDIA/TransformerEngine) (TE), [torchao](https://github.com/pytorch/ao) (native PyTorch FP8), or the legacy [MS-AMP](https://github.com/Azure/MS-AMP/tree/main) (no longer maintained, see warning below).

For an introduction to the topics discussed today, we recommend reviewing the [low-precision usage guide](../usage_guides/low_precision_training) as this documentation will reference it regularly. 

## A Quick Chart

Below is a quick chart from the MS-AMP documentation showing the different bit-precisions for each solution during training:

Optimization Level | Computation(GEMM) | Comm | Weight | Master Weight | Weight Gradient | Optimizer States
-- | -- | -- | -- | -- | -- | --
FP16 AMP | FP16 | FP32 | FP32 | N/A | FP32 | FP32+FP32
Nvidia TE | FP8 | FP32 | FP32 | N/A | FP32 | FP32+FP32
MS-AMP O1 | FP8 | FP8 | FP16 | N/A | FP8 | FP32+FP32
MS-AMP O2 | FP8 | FP8 | FP16 | N/A | FP8 | FP8+FP16
MS-AMP O3 | FP8 | FP8 | FP8 | FP16 | FP8 | FP8+FP16

## `TransformersEngine`

`TransformersEngine` is the first solution for training in 8-bit floating point. It works by using drop-in replacement layers for certain ones in a model that utilizes their FP8-engine to reduce the number of bits (such as 32 to 8) without degrading the final accuracy of the model.

Specifically, Accelerate will find and replace the following layers with `TransformersEngine` versions:

* `nn.LayerNorm` for `te.LayerNorm`
* `nn.Linear` for `te.Linear`

As a result we wind up with a model that has most of its layers in BF16, while some layers are in FP8 reducing some of the memory. 

Anecdotally, we have noticed that performance gains don't really start showing when using `TransformerEngine` until a large majority of the layers
in the model are made up of those two layers to replace. As a result, only larger models have shown performance improvements when the number of parameters is around and upwards of a few billion. 

The `TransformerEngine` can receive many different arguments that customize how it performs FP8 calculations and what they do. A full list of the arguments is available below:

* `margin`: The margin to use for the gradient scaling.
* `interval`: The interval to use for how often the scaling factor is recomputed.
* `fp8_format`: The format to use for the FP8 recipe. Must be one of `HYBRID` or `E4M3`. (Generally `HYBRID` for training, `E4M3` for evaluation)
* `amax_history_len`: The length of the history to use for the scaling factor computation
* `amax_compute_algo`: The algorithm to use for the scaling factor computation. Must be one of `max` or `most_recent`.
* `override_linear_precision`: Whether or not to execute `fprop`, `dgrad`, and `wgrad` GEMMS in higher precision.

You can customize each of these as part of [`utils.FP8RecipeKwargs`] to help optimize performance of your models.

If we notice in the chart mentioned earlier, TE simply casts the computation layers into FP8, while everything else is in FP32. As a result this winds up utilizing the most memory but does so with the benefit of guaranteeing the least amount of loss in end accuracy during training. 

## `MS-AMP`

<Tip warning={true}>

**⚠️ Deprecated / Unmaintained:** MS-AMP is no longer actively maintained by Microsoft. The repository has not seen updates since 2023 and has known compatibility issues with CUDA 12.x+, modern NCCL versions, and recent PyTorch releases (2.2+). **We strongly recommend using `TransformersEngine` or `torchao` instead.** See the [usage guide](../usage_guides/low_precision_training) for migration instructions.

</Tip>

MS-AMP takes a different approach to `TransformersEngine` by providing three different optimization levels to convert more operations in FP8 or FP16.

* The base optimization level (`O1`), passes communications of the weights (such as in DDP) in FP8, stores the weights of the model in FP16, and leaves the optimizer states in FP32. The main benefit of this optimization level is that we can reduce the communication bandwidth by essentially half. Additionally, more GPU memory is saved due to 1/2 of everything being cast in FP8, and the weights being cast to FP16. Notably, both the optimizer states remain in FP32.

* The second optimization level (`O2`) improves upon this by also reducing the precision of the optimizer states. One is in FP8 while the other is in FP16. Generally it's been shown that this will only provide a net-gain of no degraded end accuracy, increased training speed, and reduced memory as now every state is either in FP16 or FP8. 

* Finally, MS-AMP has a third optimization level (`O3`) which helps during DDP scenarios such as DeepSpeed. The weights of the model in memory are fully cast to FP8, and the master weights are now stored in FP16. This fully reduces memory by the highest factor as now not only is almost everything in FP8, only two states are left in FP16. Currently, only DeepSpeed versions up through 0.9.2 are supported, so this capability is not included in the Accelerate integration

## Combining the two

<Tip warning={true}>

Since MS-AMP is no longer maintained, this combination is not recommended for new projects.

</Tip>

More experiments need to be performed but it's been noted that combining both MS-AMP and TransformersEngine can lead to the highest throughput by relying on NVIDIA's optimized FP8 operators and utilizing how MS-AMP reduces the memory overhead.


================================================
FILE: docs/source/concept_guides/performance.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Comparing performance across distributed setups

Evaluating and comparing the performance from different setups can be quite tricky if you don't know what to look for.
For example, you cannot run the same script with the same batch size across TPU, multi-GPU, and single-GPU with Accelerate 
and expect your results to line up. 

But why?

There are three reasons for this that this tutorial will cover: 

1. **Setting the right seeds**
2. **Observed Batch Sizes**
3. **Learning Rates**

## Setting the Seed 

While this issue has not come up as much, make sure to use [`utils.set_seed`] to fully set the seed in all distributed cases so training will be reproducible:

```python
from accelerate.utils import set_seed

set_seed(42)
```

Why is this important? Under the hood this will set **5** different seed settings:

```python
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # or torch.xpu.manual_seed_all, etc
    # ^^ safe to call this function even if cuda is not available
    if is_torch_xla_available():
        xm.set_rng_state(seed)
```

These are Python's `random` state, NumPy's state, torch's state, torch's device state (e.g. CUDA), and, if TPUs are available, torch_xla's RNG state.

## Observed Batch Sizes 

When training with Accelerate, the batch size passed to the dataloader is the **batch size per GPU**. What this entails is 
a batch size of 64 on two GPUs is truly a batch size of 128. As a result, when testing on a single GPU this needs to be accounted for,
as well as similarly for TPUs. 

The below table can be used as a quick reference to try out different batch sizes:

<Tip>

In this example, there are two GPUs for "Multi-GPU" and a TPU pod with 8 workers

</Tip>

| Single GPU Batch Size | Multi-GPU Equivalent Batch Size | TPU Equivalent Batch Size |
|-----------------------|---------------------------------|---------------------------|
| 256                   | 128                             | 32                        |
| 128                   | 64                              | 16                        |
| 64                    | 32                              | 8                         |
| 32                    | 16                              | 4                         |

## Learning Rates 

As noted in multiple sources[[1](https://aws.amazon.com/blogs/machine-learning/scalable-multi-node-deep-learning-training-using-gpus-in-the-aws-cloud/)][[2](https://docs.nvidia.com/clara/clara-train-sdk/pt/model.html#classification-models-multi-gpu-training)], the learning rate should be scaled *linearly* based on the number of devices present. The below 
snippet shows doing so with Accelerate:

<Tip>

Since users can have their own learning rate schedulers defined, we leave this up to the user to decide if they wish to scale their 
learning rate or not.
 
</Tip>

```python
learning_rate = 1e-3
accelerator = Accelerator()
learning_rate *= accelerator.num_processes

optimizer = AdamW(params=model.parameters(), lr=learning_rate)
```

You will also find that `accelerate` will step the learning rate based on the number of processes being trained on. This is because 
of the observed batch size noted earlier. So in the case of 2 GPUs, the learning rate will be stepped twice as often as a single GPU
to account for the batch size being twice as large (if no changes to the batch size on the single GPU instance are made).

## Gradient Accumulation and Mixed Precision

When using gradient accumulation and mixed precision, due to how gradient averaging works (accumulation) and the precision loss (mixed precision), 
some degradation in performance is expected. This will be explicitly seen when comparing the batch-wise loss between different compute 
setups. However, the overall loss, metric, and general performance at the end of training should be _roughly_ the same.


================================================
FILE: docs/source/concept_guides/sequence_parallelism.md
================================================
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Sequence parallel in 🤗`accelerate`

This guide will cover basics of using sequence parallelism in 🤗`accelerate`.

See also the closely related [Context Parallelism](./context_parallelism.md).

## Why sequence parallelism?

With the advent of large language models, and recently reasoning models, the sequence length has been growing rapidly. This, combined with quadratic memory complexity of attention, has led to a need for more efficient ways to train models with long sequences.
With sequence length of 128k, the memory requirement of the attention matrix is `128k * 128k * 2 bytes * num_heads = ~32 GB * num_heads` for `bf16` precision, given vanilla attention implementation. Granted, with usage of `flash attention` or `SDPA` which do not materialize these attention weights, this decreases drastically, but the growth in memory requirements is still considerable.

Ulysses Sequence parallelism allows us to shard the inputs to the attention computation along the sequence dimension and compute the attention normally, but using only a slice of attention heads on each GPU. With this, we can train models with long sequences, with a few more tools, scaling to 15M+ sequence length. To see how to augment Ulysses SP with TiledMLP, Liger-Kernel, Activation checkpoint offload to CPU, and a few other tricks, please refer to the paper: [Arctic Long Sequence Training: Scalable And Efficient Training For Multi-Million Token Sequences](https://arxiv.org/abs/2506.13996).

## How is Ulysses SP different from FSDP CP

In the document [Context Parallelism](./context_parallelism.md) you can learn about deploying another technology called Context Parallelism, which also slices on the sequence dimension but uses Ring Attention instead of slicing on the head dimension.

The following articles go into a very detailed explanation of the differences between the two technologies:
- https://insujang.github.io/2024-01-11/tensor-parallelism-and-sequence-parallelism-detailed-analysis/
- https://huggingface.co/blog/exploding-gradients/ulysses-ring-attention

A quick summary adapting from one of the articles:
- Ulysses SP has a relatively low communication overhead, but is limited by the number of Attention Heads and thus it has certain requirements for network topology (number of attention heads has to be divisible by the number of participating gpus for a single replica). All-to-all communication is sensitive to latency and it requires Deepspeed.
- FSDP CP Ring-Attention's P2P ring communication has no aforementioned divisibility requirements, but has a higher communication volume.

Finally it should be possible to combine SP + CP as explained in the paper [USP: A Unified Sequence Parallelism Approach for Long Context Generative AI](https://arxiv.org/abs/2405.07719) to support an even longer sequence length, albeit this is not yet integrated into 🤗`accelerate`.


## Supported sequence parallelism backends

Currently the only sequence parallelism backend is `deepspeed`, which comes from the modernized Ulysses SP which is part of the [Arctic Long Sequence Training technology](https://arxiv.org/abs/2506.13996). There is also a [tutorial](https://www.deepspeed.ai/tutorials/ulysses-alst-sequence-parallelism/) should you want to integrate it into your own code directly.

## How to use sequence parallelism?

```diff
from accelerate.utils import ParallelismConfig, DeepSpeedSequenceParallelConfig

+# Example: 4 GPUs with sp_size=4, dp_shard_size=1
+# Ensure: dp_replicate_size × dp_shard_size × sp_size = 1 × 1 × 4 = 4 GPUs
parallelism_config = ParallelismConfig(
+     sp_backend="deepspeed",
+     sp_size=4,
+     dp_shard_size=1,  # Explicit: no data parallelism
+     sp_handler=DeepSpeedSequenceParallelConfig(
+         sp_seq_length_is_variable=True,
+         sp_attn_implementation="sdpa",
+     ),
+ )

accelerator = Accelerator(
    ...,
    parallelism_config=parallelism_config,
)
```

As with any other feature in 🤗`accelerate`, you can enable sequence parallelism also by passing the corresponding flags to `accelerate launch`. In this case, it's no different:

```bash
accelerate launch --parallelism-config-sp-size 8  ...
```

> [!Tip]
> You can also set the `sp_size` and other configuration in the `accelerate config` command, which will save them in your `accelerate` configuration file, so you don't have to pass them every time you launch your script.

> [!Tip]
> sequence parallelism combines with data parallelism. It doesn't require additional GPUs.
> So if you have 8 gpus you can do: `--parallelism-config-dp-shard-size 1 --parallelism-config-sp-size 8`. Or you can use the `ParallelismConfig` class to set them programmatically.
>
> **Important**: You must ensure `dp_replicate_size × dp_shard_size × sp_size = num_processes`. For example, with 8 GPUs and `sp_size=8`, you need `dp_shard_size=1` (since 1 × 1 × 8 = 8). With 4 GPUs and `sp_size=2`, you could use `dp_shard_size=2` (since 1 × 2 × 2 = 4) for 2D parallelism.


## ALST/Ulysses SP backend configuration

ALST/UlyssesSP implements sequence parallelism using attention head parallelism, as explained in [this paper](https://arxiv.org/abs/2506.13996). For simplicity, we reuse the concept and setup of sequence parallelism, which, from the user's perspective, is the same: multiple GPUs are used to process a single batch.

To give a sense of what ALST made possible - it allowed us to train in bf16 with 500K tokens on a single H100 GPU, 3.7M on a single node, and 15M on Llama-8B using just four nodes. This feature of HF Accelerate enables only 1 of the 3 ALST components, so the achievable sequence length will be smaller. You'd want TiledMLP, Activation checkpoint offload to CPU, and a few other things enabled to get the full power of ALST. For details, please refer to [this tutorial](https://www.deepspeed.ai/tutorials/ulysses-alst-sequence-parallelism/).

To configure the `deepspeed` backend:

```python
# Example: 4 GPUs with sp_size=4, dp_shard_size=1
# Ensure: dp_replicate_size × dp_shard_size × sp_size = 1 × 1 × 4 = 4 GPUs
parallelism_config = ParallelismConfig(
    sp_backend="deepspeed",
    sp_size=4,
    dp_shard_size=1,  # Explicit: no data parallelism
    sp_handler=DeepSpeedSequenceParallelConfig(
        sp_seq_length=256,
        sp_seq_length_is_variable=True,
        sp_attn_implementation="sdpa",
    ),
)
accelerator = Accelerator(
    ...,
    parallelism_config=parallelism_config,
)
```

- `sp_backend`: set to `deepspeed` here
- `sp_size` is the degree of the sequence parallelism - in the above example it's 4, therefore 4 gpus will be used to process a single batch (while doing DP=4 over the same gpus)
- `sp_seq_length` and `sp_seq_length_is_variable` are used to deal with sequence lengths. If `sp_seq_length_is_variable=True` the backend will work with a sequence length that may change between batches, in which case `sp_seq_length` value can be set to anything divisible by the sequence parallel degree or not set at all. In this case on every `forward` the sequence variables will be derived from input. If `False` then `sp_seq_length` needs to match the batch's sequence length dimension, which then will have to be padded to be always the same. The default is `True`.
- `sp_attn_implementation` is one of `sdpa`, `flash_attention_2` or `flash_attention_3`. This sequence parallel implementation uses `position_ids` instead of `attention_mask` therefore, `eager` can't work here until it supports working with `position_ids`. Also, please note that `sdpa` doesn't handle multiple samples combined into one correctly; it will attend to the whole sample as one. If the samples aren't combined, `sdpa` will work correctly. Therefore, Flash Attention should be the ideal choice as it always works.

Instead of setting these values in the `DeepSpeedSequenceParallelConfig` object, you can also use environment variables to accomplish the same thing. The corresponding variables are:
- `PARALLELISM_CONFIG_SP_BACKEND`
- `PARALLELISM_CONFIG_SP_SEQ_LENGTH`
- `PARALLELISM_CONFIG_SP_SEQ_LENGTH_IS_VARIABLE`
- `PARALLELISM_CONFIG_SP_ATTN_IMPLEMENTATION`

If not passed in the code, `sp_size` can be set via `--parallelism_config_sp_size` CLI argument. Same for other arguments. You can also do the accelerate config file style config, e.g., for 2 GPUs:

```yaml
distributed_type: DEEPSPEED
deepspeed_config:
  deepspeed_config_file: path/to/ds_config.json
machine_rank: 0
num_machines: 1
num_processes: 2
parallelism_config:
  parallelism_config_dp_replicate_size: 1
  parallelism_config_dp_shard_size: 1  # Must satisfy: 1 × 1 × 2 = 2 num_processes
  parallelism_config_sp_size: 2
  parallelism_config_sp_backend: deepspeed
  parallelism_config_sp_seq_length_is_variable: true
  parallelism_config_sp_attn_implementation: sdpa

```

As mentioned earlier, Ulysses sequence parallelism is normally overlaid with data parallelism - the same ranks are used for feeding unique data streams and also for performing Ulysses Sequence Parallelism. But you could also create replicas like so:

```python
# Example: 4 GPUs with 2D parallelism (SP=2, DP=2)
# Ensure: dp_replicate_size × dp_shard_size × sp_size = 2 × 1 × 2 = 4 GPUs
parallelism_config = ParallelismConfig(
    dp_replicate_size=2,
    dp_shard_size=1,  # Explicit: no sharding within replicas
    sp_size=2,
    sp_backend="deepspeed",
    sp_handler=DeepSpeedSequenceParallelConfig(...),
)
```
Here we use 4 gpus, with 2 sequence parallelism replicas. Deepspeed-ZeRO is what drives the data parallelism here.

Please note that a lot of magic is hidden inside [UlyssesSPDataLoaderAdapter](https://github.com/deepspeedai/DeepSpeed/blob/64c0052fa08438b4ecf4cae30af15091a92d2108/deepspeed/runtime/sequence_parallel/ulysses_sp.py#L442). It's used behind the scenes, wrapping your original DataLoader object, but you should be aware of it should you run into any problems. It also automatically injects the correct `shift_labels` into the batch dictionary, before the batch gets sharded across the participating ranks.

Now the only remaining piece to start using ALST/UlyssesSP is to aggregate the loss across ranks using a differentiable `all_gather` to get the grads right. The following code does it, while also excluding any tokens masked out with `-100`, to get the correct average:

```python
sp_size = parallelism_config.sp_size if parallelism_config is not None else 1
if sp_size > 1:
    sp_group = accelerator.torch_device_mesh["sp"].get_group()
    sp_world_size = parallelism_config.sp_size

# Normal training loop
for iter, batch in enumerate(dl):
    optimizer.zero_grad()

    batch = move_to_device(batch, model.device)

    # The model automatically receives shift_labels via **kwargs and uses it for loss computation.
    # Both standard transformers models and Liger-patched models handle this correctly.
    outputs = model(**batch)
    loss = outputs.loss
    shift_labels = batch["shift_labels"]

    if sp_size > 1:
        # differentiable weighted per-shard-loss aggregation across ranks
        losses_per_rank = torch.distributed.nn.functional.all_gather(loss, group=sp_group)
        # special dealing with SFT that has prompt tokens that aren't used in loss computation
        good_tokens = (shift_labels != -100).view(-1).sum()
        good_tokens_per_rank = torch.distributed.nn.functional.all_gather(
            good_tokens, group=sp_group
        )
        # Skip ranks with zero valid tokens to avoid NaN contamination (NaN * 0 = NaN)
        total_loss = sum(
            losses_per_rank[rank] * good_tokens_per_rank[rank]
            for rank in range(sp_world_size)
            if good_tokens_per_rank[rank] > 0
        )
        total_good_tokens = sum(good_tokens_per_rank)
        loss = total_loss / max(total_good_tokens, 1)

    # accelerator.print only prints on the local main process, so no explicit rank check is needed
    accelerator.print(f"{iter}: {loss=}")
    accelerator.log(dict(train_loss=loss, step=iter))

    accelerator.backward(loss)
    optimizer.step()
```

Note that models automatically handle `shift_labels` when it's present in the batch. The model's forward pass receives `shift_labels` via `**kwargs` and passes it to the loss function, which correctly computes the loss for sequence parallelism. If you use [Liger Kernel](https://github.com/linkedin/Liger-Kernel), it also handles `shift_labels` seamlessly and computes loss in a very memory-efficient way. Liger is highly recommended for long sequence lengths, as it liberates GPU memory by using fused operations (e.g., fused logit-loss computation that never materializes the full logits tensor in memory).

If you want to see what HF Accelerate did behind the scenes please read [this full integration tutorial](https://www.deepspeed.ai/tutorials/ulysses-alst-sequence-parallelism/).

For an example of an Accelerate training loop with enabled ALST/UlyssesSP see [examples/alst_ulysses_sequence_parallelism](https://github.com/huggingface/accelerate/blob/main/examples/alst_ulysses_sequence_parallelism).

> [!Warning]
> This API is quite new and still in its experimental stage. While we strive to provide a stable API, some small parts of the public API may change in the future.

Since this is a Deepspeed backend the usual Deepspeed configuration applies, so you can combine sequence parallelism with optimizer states and/or weights offloading as well to liberate more gpu memory and enable an even longer sequence length. This technology has been tested to work with DeepSpeed ZeRO stage 2 and 3.


================================================
FILE: docs/source/concept_guides/training_tpu.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Training on TPUs

Training on TPUs can be slightly different from training on multi-gpu, even with Accelerate. This guide aims to show you 
where you should be careful and why, as well as the best practices in general.

## Training in a Notebook

The main carepoint when training on TPUs comes from the [`notebook_launcher`]. As mentioned in the [notebook tutorial](../usage_guides/notebook), you need to 
restructure your training code into a function that can get passed to the [`notebook_launcher`] function and be careful about not declaring any tensors on the GPU.

While on a TPU that last part is not as important, a critical part to understand is that when you launch code from a notebook you do so through a process called **forking**. 
When launching from the command-line, you perform **spawning**, where a python process is not currently running and you *spawn* a new process in. Since your Jupyter notebook is already 
utilizing a python process, you need to *fork* a new process from it to launch your code. 

Where this becomes important is in regard to declaring your model. On forked TPU processes, it is recommended that you instantiate your model *once* and pass this into your 
training function. This is different than training on GPUs where you create `n` models that have their gradients synced and back-propagated at certain moments. Instead, one 
model instance is shared between all the nodes and it is passed back and forth. This is important especially when training on low-resource TPUs such as those provided in Kaggle kernels or
on Google Colaboratory. 

Below is an example of a training function passed to the [`notebook_launcher`] if training on CPUs or GPUs:

<Tip>

    This code snippet is based off the one from the `simple_nlp_example` notebook found [here](https://github.com/huggingface/notebooks/blob/main/examples/accelerate_examples/simple_nlp_example.ipynb) with slight 
    modifications for the sake of simplicity

</Tip>

```python
def training_function():
    # Initialize accelerator
    accelerator = Accelerator()
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
    train_dataloader, eval_dataloader = create_dataloaders(
        train_batch_size=hyperparameters["train_batch_size"], eval_batch_size=hyperparameters["eval_batch_size"]
    )

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=hyperparameters["learning_rate"])

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    num_epochs = hyperparameters["num_epochs"]
    # Now we train the model
    for epoch in range(num_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            optimizer.zero_grad()
```

```python
from accelerate import notebook_launcher

notebook_launcher(training_function)
```

<Tip>

    The `notebook_launcher` will default to 8 processes if Accelerate has been configured for a TPU

</Tip>

If you use this example and declare the model *inside* the training function, then on a low-resource system you will potentially see an error 
like:

```
ProcessExitedException: process 0 terminated with signal SIGSEGV
```

This error is *extremely* cryptic but the basic explanation is you ran out of system RAM. You can avoid this entirely by reconfiguring the training function to 
accept a single `model` argument, and declare it in an outside cell:

```python
# In another Jupyter cell
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
```

```diff
+ def training_function(model):
      # Initialize accelerator
      accelerator = Accelerator()
-     model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
      train_dataloader, eval_dataloader = create_dataloaders(
          train_batch_size=hyperparameters["train_batch_size"], eval_batch_size=hyperparameters["eval_batch_size"]
      )
  ...
```

And finally calling the training function with:

```diff
  from accelerate import notebook_launcher
- notebook_launcher(training_function)
+ notebook_launcher(training_function, (model,))
```

<Tip>

    The above workaround is only needed when launching a TPU instance from a Jupyter Notebook on a low-resource server such as Google Colaboratory or Kaggle. If 
    using a script or launching on a much beefier server declaring the model beforehand is not needed.

</Tip>

## Mixed Precision and Global Variables 

As mentioned in the [mixed precision tutorial](../usage_guides/mixed_precision), Accelerate supports fp16 and bf16, both of which can be used on TPUs.
That being said, ideally `bf16` should be utilized as it is extremely efficient to use.

There are two "layers" when using `bf16` and Accelerate on TPUs, at the base level and at the operation level. 

At the base level, this is enabled when passing `mixed_precision="bf16"` to `Accelerator`, such as:
```python
accelerator = Accelerator(mixed_precision="bf16")
```
By default, this will cast `torch.float` and `torch.double` to `bfloat16` on TPUs. 
Specifically, this sets the `XLA_USE_BF16` environment variable to `1`.

There is a further configuration you can perform which is setting the `XLA_DOWNCAST_BF16` environmental variable. If set to `1`, then 
`torch.float` is `bfloat16` and `torch.double` is `float32`.

This is performed in the `Accelerator` object when passing `downcast_bf16=True`:
```python
accelerator = Accelerator(mixed_precision="bf16", downcast_bf16=True)
```

Using downcasting instead of bf16 everywhere is good for when you are trying to calculate metrics, log values, and more where raw bf16 tensors would be unusable. 

## Training Times on TPUs

As you launch your script, you may notice that training seems exceptionally slow at first. This is because TPUs
first run through a few batches of data to see how much memory to allocate before finally utilizing this configured 
memory allocation extremely efficiently. 

If you notice that your evaluation code for calculating the metrics of your model is too slow because a larger batch size is being used, 
it is recommended to keep the evaluation batch size the same as the training batch size. Otherwise, the memory will be reallocated for this 
new batch size after the first few iterations. 

<Tip>

    Just because the memory is allocated does not mean it will be used or that the batch size will increase when going back to your training dataloader.

</Tip>


================================================
FILE: docs/source/index.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Accelerate

Accelerate is a library that enables the same PyTorch code to be run across any distributed configuration by adding just four lines of code! In short, training and inference at scale made simple, efficient and adaptable.

```diff
+ from accelerate import Accelerator
+ accelerator = Accelerator()

+ model, optimizer, training_dataloader, scheduler = accelerator.prepare(
+     model, optimizer, training_dataloader, scheduler
+ )

  for batch in training_dataloader:
      optimizer.zero_grad()
      inputs, targets = batch
      inputs = inputs.to(device)
      targets = targets.to(device)
      outputs = model(inputs)
      loss = loss_function(outputs, targets)
+     accelerator.backward(loss)
      optimizer.step()
      scheduler.step()
```

Built on `torch_xla` and `torch.distributed`, Accelerate takes care of the heavy lifting, so you don't have to write any custom code to adapt to these platforms.
Convert existing codebases to utilize [DeepSpeed](usage_guides/deepspeed), perform [fully sharded data parallelism](usage_guides/fsdp), and have automatic support for mixed-precision training! 

<Tip> 

  To get a better idea of this process, make sure to check out the [Tutorials](basic_tutorials/overview)! 

</Tip>


This code can then be launched on any system through Accelerate's CLI interface:
```bash
accelerate launch {my_script.py}
```

<div class="mt-10">
  <div class="w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5">
    <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./basic_tutorials/overview"
      ><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Tutorials</div>
      <p class="text-gray-700">Learn the basics and become familiar with using Accelerate. Start here if you are using Accelerate for the first time!</p>
    </a>
    <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./usage_guides/explore"
      ><div class="w-full text-center bg-gradient-to-br from-indigo-400 to-indigo-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">How-to guides</div>
      <p class="text-gray-700">Practical guides to help you achieve a specific goal. Take a look at these guides to learn how to use Accelerate to solve real-world problems.</p>
    </a>
    <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./concept_guides/gradient_synchronization"
      ><div class="w-full text-center bg-gradient-to-br from-pink-400 to-pink-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Conceptual guides</div>
      <p class="text-gray-700">High-level explanations for building a better understanding of important topics such as avoiding subtle nuances and pitfalls in distributed training and DeepSpeed.</p>
   </a>
    <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./package_reference/accelerator"
      ><div class="w-full text-center bg-gradient-to-br from-purple-400 to-purple-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Reference</div>
      <p class="text-gray-700">Technical descriptions of how Accelerate classes and methods work.</p>
    </a>
  </div>
</div>


================================================
FILE: docs/source/package_reference/accelerator.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Accelerator

The [`Accelerator`] is the main class for enabling distributed training on any type of training setup. Read the [Add Accelerator to your code](../basic_tutorials/migration) tutorial to learn more about how to add the [`Accelerator`] to your script.

## Accelerator[[api]]

[[autodoc]] Accelerator

## Utilities

[[autodoc]] accelerate.utils.gather_object


================================================
FILE: docs/source/package_reference/big_modeling.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Working with large models

## Dispatch and offload

### init_empty_weights

[[autodoc]] big_modeling.init_empty_weights

### cpu_offload

[[autodoc]] big_modeling.cpu_offload

### cpu_offload_with_hook

[[autodoc]] big_modeling.cpu_offload_with_hook

### disk_offload

[[autodoc]] big_modeling.disk_offload

### dispatch_model

[[autodoc]] big_modeling.dispatch_model

### load_checkpoint_and_dispatch

[[autodoc]] big_modeling.load_checkpoint_and_dispatch

### load_checkpoint_in_model

[[autodoc]] big_modeling.load_checkpoint_in_model

### infer_auto_device_map

[[autodoc]] utils.infer_auto_device_map

## Hooks

### ModelHook

[[autodoc]] hooks.ModelHook

### AlignDevicesHook

[[autodoc]] hooks.AlignDevicesHook

### SequentialHook

[[autodoc]] hooks.SequentialHook

### LayerwiseCastingHook

[[autodoc]] hooks.LayerwiseCastingHook

## Adding Hooks

### add_hook_to_module

[[autodoc]] hooks.add_hook_to_module

### attach_execution_device_hook

[[autodoc]] hooks.attach_execution_device_hook

### attach_align_device_hook

[[autodoc]] hooks.attach_align_device_hook

### attach_align_device_hook_on_blocks

[[autodoc]] hooks.attach_align_device_hook_on_blocks

### attach_layerwise_casting_hooks

[[autodoc]] big_modeling.attach_layerwise_casting_hooks

## Removing Hooks

### remove_hook_from_module

[[autodoc]] hooks.remove_hook_from_module

### remove_hook_from_submodules

[[autodoc]] hooks.remove_hook_from_submodules

## Utilities

### has_offloaded_params

[[autodoc]] utils.has_offloaded_params

### align_module_device

[[autodoc]] utils.align_module_device


================================================
FILE: docs/source/package_reference/cli.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# The Command Line 

Below is a list of all the available 🤗 Accelerate commands with their parameters.

## accelerate config

**Command**:

`accelerate config` or `accelerate-config`

Launches a series of prompts to create and save a `default_config.yaml` configuration file for your training system. Should
always be run first on your machine.

**Usage**: 

```bash
accelerate config [arguments]
```

**Optional Arguments**:
* `--config_file CONFIG_FILE` (`str`) -- The path to use to store the config file. Will default to a file named default_config.yaml in the cache location, which is the content
                        of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have such an environment variable, your cache directory
                        (`~/.cache` or the content of `XDG_CACHE_HOME`) suffixed with `huggingface`.
* `-h`, `--help` (`bool`) -- Show a help message and exit

## accelerate config default

**Command**:

`accelerate config default` or `accelerate-config default`

Create a default config file for Accelerate with only a few flags set.

**Usage**: 

```bash
accelerate config default [arguments]
```

**Optional Arguments**:
* `--config_file CONFIG_FILE` (`str`) -- The path to use to store the config file. Will default to a file named default_config.yaml in the cache location, which is the content
                        of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have such an environment variable, your cache directory
                        (`~/.cache` or the content of `XDG_CACHE_HOME`) suffixed with `huggingface`.

* `-h`, `--help` (`bool`) -- Show a help message and exit
* `--mixed_precision {no,fp16,bf16}` (`str`) -- Whether or not to use mixed precision training. Choose between FP16 and BF16 (bfloat16) training. BF16 training is only supported on Nvidia Ampere GPUs and PyTorch 1.10 or later.

## accelerate config update

**Command**:

`accelerate config update` or `accelerate-config update`

Update an existing config file with the latest defaults while maintaining the old configuration.

**Usage**: 

```bash
accelerate config update [arguments]
```

**Optional Arguments**:
* `--config_file CONFIG_FILE` (`str`) -- The path to the config file to update. Will default to a file named default_config.yaml in the cache location, which is the content
                        of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have such an environment variable, your cache directory
                        (`~/.cache` or the content of `XDG_CACHE_HOME`) suffixed with `huggingface`.

* `-h`, `--help` (`bool`) -- Show a help message and exit


## accelerate env

**Command**:

`accelerate env` or `accelerate-env` or `python -m accelerate.commands.env`

Lists the contents of the passed 🤗 Accelerate configuration file. Should always be used when opening an issue on the [GitHub repository](https://github.com/huggingface/accelerate).

**Usage**:

```bash
accelerate env [arguments]
```

**Optional Arguments**:
* `--config_file CONFIG_FILE` (`str`) -- The path to use to store the config file. Will default to a file named default_config.yaml in the cache location, which is the content
                        of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have such an environment variable, your cache directory
                        (`~/.cache` or the content of `XDG_CACHE_HOME`) suffixed with `huggingface`.
* `-h`, `--help` (`bool`) -- Show a help message and exit

## accelerate launch

**Command**:

`accelerate launch` or `accelerate-launch` or `python -m accelerate.commands.launch`

Launches a specified script on a distributed system with the right parameters.

**Usage**: 

```bash
accelerate launch [arguments] {training_script} --{training_script-argument-1} --{training_script-argument-2} ...
```

**Positional Arguments**:

- `{training_script}` -- The full path to the script to be launched in parallel
- `--{training_script-argument-1}` -- Arguments of the training script

**Optional Arguments**:

* `-h`, `--help` (`bool`) -- Show a help message and exit
* `--config_file CONFIG_FILE` (`str`) -- The config file to use for the default values in the launching script.
* `-m`, `--module` (`bool`) -- Change each process to interpret the launch script as a Python module, executing with the same behavior as 'python -m'.
* `--no_python` (`bool`) -- Skip prepending the training script with 'python' - just execute it directly. Useful when the script is not a Python script.
* `--debug` (`bool`) -- Whether to print out the torch.distributed stack trace when something fails.
* `-q`, `--quiet` (`bool`) -- Silence subprocess errors from the launch stack trace to only show the relevant tracebacks. (Only applicable to DeepSpeed and single-process configurations).


The rest of these arguments are configured through `accelerate config` and are read in from the specified `--config_file` (or default configuration) for their 
values. They can also be passed in manually.

**Hardware Selection Arguments**:

* `--cpu` (`bool`) -- Whether or not to force the training on the CPU.
* `--multi_gpu` (`bool`) -- Whether or not this should launch a distributed GPU training.
* `--tpu` (`bool`) -- Whether or not this should launch a TPU training.

**Resource Selection Arguments**:

The following arguments are useful for fine-tuning how available hardware should be used

* `--mixed_precision {no,fp16,bf16,fp8}` (`str`) -- Whether or not to use mixed precision training. Choose between FP16 and BF16 (bfloat16) training. BF16 training is only supported on Nvidia Ampere GPUs and PyTorch 1.10 or later.
* `--num_processes NUM_PROCESSES` (`int`) -- The total number of processes to be launched in parallel.
* `--num_machines NUM_MACHINES` (`int`) -- The total number of machines used in this training.
* `--num_cpu_threads_per_process NUM_CPU_THREADS_PER_PROCESS` (`int`) -- The number of CPU threads per process. Can be tuned for optimal performance.
* `--enable_cpu_affinity` (`bool`) -- Whether or not CPU affinity and balancing should be enabled. Currently only supported on NVIDIA hardware.

**Training Paradigm Arguments**:

The following arguments are useful for selecting which training paradigm to use.

* `--use_deepspeed` (`bool`) -- Whether or not to use DeepSpeed for training.
* `--use_fsdp` (`bool`) -- Whether or not to use FullyShardedDataParallel for training.
* `--use_megatron_lm` (`bool`) -- Whether or not to use Megatron-LM for training.

**Distributed GPU Arguments**:

The following arguments are only useful when `multi_gpu` is passed or multi-gpu training is configured through `accelerate config`: 

* `--gpu_ids` (`str`) -- What GPUs (by id) should be used for training on this machine as a comma-separated list
* `--same_network` (`bool`) -- Whether all machines used for multinode training exist on the same local network.
* `--machine_rank` (`int`) -- The rank of the machine on which this script is launched.
* `--main_process_ip` (`str`) -- The IP address of the machine of rank 0.
* `--main_process_port` (`int`) -- The port to use to communicate with the machine of rank 0.
* `-t`, `--tee` (`str`) -- Tee std streams into a log file and also to console.
* `--log_dir` (`str`) -- Base directory to use for log files when using torchrun/torch.distributed.run as launcher. Use with --tee to redirect std streams into log files.
* `--role` (`str`) -- User-defined role for the workers.
* `--rdzv_backend` (`str`) -- The rendezvous method to use, such as 'static' (the default) or 'c10d'
* `--rdzv_conf` (`str`) -- Additional rendezvous configuration (<key1>=<value1>,<key2>=<value2>,...).
* `--max_restarts` (`int`) -- Maximum number of worker group restarts before failing.
* `--monitor_interval` (`int`) -- Interval, in seconds, to monitor the state of workers.

**TPU Arguments**:

The following arguments are only useful when `tpu` is passed or TPU training is configured through `accelerate config`: 

* `--tpu_cluster` (`bool`) -- Whether to use a GCP TPU pod for training.
* `--tpu_use_sudo` (`bool`) -- Whether to use `sudo` when running the TPU training script in each pod.
* `--vm` (`str`) -- List of single Compute VM instance names. If not provided we assume usage of instance groups. For TPU pods.
* `--env` (`str`) -- List of environment variables to set on the Compute VM instances. For TPU pods.
* `--main_training_function` (`str`) -- The name of the main function to be executed in your script (only for TPU training).
* `--downcast_bf16` (`bool`) -- When using bf16 precision on TPUs, whether both float and double tensors are cast to bfloat16 or double tensors remain as float32.

**DeepSpeed Arguments**:

The following arguments are only useful when `use_deepspeed` is passed or `deepspeed` is configured through `accelerate config`: 

* `--deepspeed_config_file` (`str`) -- DeepSpeed config file.
* `--zero_stage` (`int`) -- DeepSpeed's ZeRO optimization stage.
* `--offload_optimizer_device` (`str`) -- Decides where (none|cpu|nvme) to offload optimizer states.
* `--offload_param_device` (`str`) -- Decides where (none|cpu|nvme) to offload parameters.
* `--offload_optimizer_nvme_path` (`str`) -- Decides Nvme Path to offload optimizer states.
* `--gradient_accumulation_steps` (`int`) -- Number of gradient_accumulation_steps used in your training script.
* `--gradient_clipping` (`float`) -- Gradient clipping value used in your training script.
* `--zero3_init_flag` (`str`) -- Decides Whether (true|false) to enable `deepspeed.zero.Init` for constructing massive models. Only applicable with DeepSpeed ZeRO Stage-3.
* `--zero3_save_16bit_model` (`str`) -- Decides Whether (true|false) to save 16-bit model weights when using ZeRO Stage-3. Only applicable with DeepSpeed ZeRO Stage-3.
* `--deepspeed_hostfile` (`str`) -- DeepSpeed hostfile for configuring multi-node compute resources.
* `--deepspeed_exclusion_filter` (`str`) -- DeepSpeed exclusion filter string when using multi-node setup.
* `--deepspeed_inclusion_filter` (`str`) -- DeepSpeed inclusion filter string when using multi-node setup.
* `--deepspeed_multinode_launcher` (`str`) -- DeepSpeed multi-node launcher to use.
* `--deepspeed_moe_layer_cls_names` (`str`) -- Comma-separated list of transformer MoE layer class names (case-sensitive) to wrap, e.g., `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock`

**Fully Sharded Data Parallelism Arguments**:

The following arguments are only useful when `use_fsdp` is passed or Fully Sharded Data Parallelism is configured through `accelerate config`:

* `--fsdp_offload_params` (`str`) -- Decides Whether (true|false) to offload parameters and gradients to CPU.
* `--fsdp_min_num_params` (`int`) -- FSDP's minimum number of parameters for Default Auto Wrapping.
* `--fsdp_sharding_strategy` (`int`) -- FSDP's Sharding Strategy.
* `--fsdp_auto_wrap_policy` (`str`) -- FSDP's auto wrap policy.
* `--fsdp_transformer_layer_cls_to_wrap` (`str`) -- Transformer layer class name (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`, `T5Block` ...
* `--fsdp_backward_prefetch_policy` (`str`) -- FSDP's backward prefetch policy.
* `--fsdp_state_dict_type` (`str`) -- FSDP's state dict type.
* `--fsdp_forward_prefetch` (`str`) -- FSDP forward prefetch.
* `--fsdp_use_orig_params` (`str`) -- If True, allows non-uniform `requires_grad` mixed in a FSDP unit.
* `--fsdp_cpu_ram_efficient_loading` (`str`) -- If true, only the first process loads the pretrained model checkpoint while all other processes have empty weights. When using this, `--fsdp_sync_module_states` needs to be True.
* `--fsdp_sync_module_states` (`str`) -- If true, each individually wrapped FSDP unit will broadcast module parameters from rank 0.
* `--fsdp_activation_checkpointing` (`bool`) -- Decides Whether intermediate activations are freed during the forward pass, and a checkpoint is left as a placeholder

**Megatron-LM Arguments**:

The following arguments are only useful when `use_megatron_lm` is passed or Megatron-LM is configured through `accelerate config`:

* `--megatron_lm_tp_degree` (`int`) -- Megatron-LM's Tensor Parallelism (TP) degree.
* `--megatron_lm_pp_degree` (`int`) -- Megatron-LM's Pipeline Parallelism (PP) degree.
* `--megatron_lm_num_micro_batches` (`int`) -- Megatron-LM's number of micro batches when PP degree > 1.
* `--megatron_lm_sequence_parallelism` (`str`) -- Decides Whether (true|false) to enable Sequence Parallelism when TP degree > 1.
* `--megatron_lm_recompute_activations` (`str`) -- Decides Whether (true|false) to enable Selective Activation Recomputation.
* `--megatron_lm_use_distributed_optimizer` (`str`) -- Decides Whether (true|false) to use distributed optimizer which shards optimizer state and gradients across Data Parallel (DP) ranks.
* `--megatron_lm_gradient_clipping` (`float`) -- Megatron-LM's gradient clipping value based on global L2 Norm (0 to disable).

**FP8 Arguments**:

* `--fp8_backend` (`str`) -- Choose a backend to train with FP8 (`te` or `msamp`)
* `--fp8_use_autocast_during_eval` (`bool`) -- Whether to use FP8 autocast during eval mode (useful only when `--fp8_backend=te` is passed). Generally better metrics are found when this is not passed.
* `--fp8_margin` (`int`) -- The margin to use for the gradient scaling (useful only when `--fp8_backend=te` is passed).
* `--fp8_interval` (`int`) -- The interval to use for how often the scaling factor is recomputed (useful only when `--fp8_backend=te` is passed).
* `--fp8_format` (`str`) -- The format to use for the FP8 recipe (useful only when `--fp8_backend=te` is passed).
* `--fp8_amax_history_len` (`int`) -- The length of the history to use for the scaling factor computation (useful only when `--fp8_backend=te` is passed).
* `--fp8_amax_compute_algo` (`str`) -- The algorithm to use for the scaling factor computation. (useful only when `--fp8_backend=te` is passed).
* `--fp8_override_linear_precision` (`Tuple[bool, bool, bool]`) -- Whether or not to execute `fprop`, `dgrad`, and `wgrad` GEMMS in higher precision.
* `--fp8_opt_level` (`str`) -- What level of 8-bit collective communication should be used with MS-AMP (useful only when `--fp8_backend=msamp` is passed)

**AWS SageMaker Arguments**:

The following arguments are only useful when training in SageMaker

* `--aws_access_key_id AWS_ACCESS_KEY_ID` (`str`) -- The AWS_ACCESS_KEY_ID used to launch the Amazon SageMaker training job
* `--aws_secret_access_key AWS_SECRET_ACCESS_KEY` (`str`) -- The AWS_SECRET_ACCESS_KEY used to launch the Amazon SageMaker training job

## accelerate estimate-memory

**Command**:

`accelerate estimate-memory` or `accelerate-estimate-memory` or `python -m accelerate.commands.estimate`

Estimates the total vRAM a particular model hosted on the Hub needs to be loaded in with an estimate for training. Requires that `huggingface_hub` be installed. 

<Tip>

    When performing inference, typically add ≤20% to the result as overall allocation [as referenced here](https://blog.eleuther.ai/transformer-math/). We will have more extensive estimations in the future that will automatically be included in the calculation.

</Tip>

**Usage**: 

```bash
accelerate estimate-memory {MODEL_NAME} --library_name {LIBRARY_NAME} --dtypes {dtype_1} {dtype_2} ...
```

**Required Arguments**:

* `MODEL_NAME` (`str`) -- The model name on the Hugging Face Hub

**Optional Arguments**:

* `--library_name {timm,transformers}` (`str`) -- The library the model has an integration with, such as `transformers`, needed only if this information is not stored on the Hub
* `--dtypes {float32,float16,int8,int4}` (`[{float32,float16,int8,int4} ...]`) -- The dtypes to use for the model, must be one (or many) of `float32`, `float16`, `int8`, and `int4`
* `--trust_remote_code` (`bool`) -- Whether or not to allow for custom models defined on the Hub in their own modeling files. This option should only be passed for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine.

## accelerate tpu-config

`accelerate tpu-config`

**Usage**:

```bash
accelerate tpu-config [arguments]
```

**Optional Arguments**:
* `-h`, `--help` (`bool`) -- Show a help message and exit

**Config Arguments**:

Arguments that can be configured through `accelerate config`.

* `--config_file` (`str`) -- Path to the config file to use for accelerate.
* `--tpu_name` (`str`) -- The name of the TPU to use. If not specified, will use the TPU specified in the config file.
* `--tpu_zone` (`str`) -- The zone of the TPU to use. If not specified, will use the zone specified in the config file.

**TPU Arguments**:

Arguments for options run inside the TPU.

* `--command_file` (`str`) -- The path to the file containing the commands to run on the pod on startup.
* `--command` (`str`) -- A command to run on the pod. Can be passed multiple times.
* `--install_accelerate` (`bool`) -- Whether to install accelerate on the pod. Defaults to False.
* `--accelerate_version` (`str`) -- The version of accelerate to install on the pod. If not specified, will use the latest pypi version. Specify 'dev' to install from GitHub.
* `--debug` (`bool`) -- If set, will print the command that would be run instead of running it.

## accelerate test

`accelerate test` or `accelerate-test`

Runs `accelerate/test_utils/test_script.py` to verify that 🤗 Accelerate has been properly configured on your system and runs. 

**Usage**: 

```bash
accelerate test [arguments]
```

**Optional Arguments**:
* `--config_file CONFIG_FILE` (`str`) -- The path to use to store the config file. Will default to a file named default_config.yaml in the cache location, which is the content
                        of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have such an environment variable, your cache directory
                        (`~/.cache` or the content of `XDG_CACHE_HOME`) suffixed with `huggingface`.
* `-h`, `--help` (`bool`) -- Show a help message and exit


================================================
FILE: docs/source/package_reference/deepspeed.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# DeepSpeed utilities

## DeepSpeedPlugin

[[autodoc]] utils.DeepSpeedPlugin

## get_active_deepspeed_plugin

[[autodoc]] utils.get_active_deepspeed_plugin

## DeepSpeedEngineWrapper

[[autodoc]] utils.deepspeed.DeepSpeedEngineWrapper

## DeepSpeedOptimizerWrapper

[[autodoc]] utils.deepspeed.DeepSpeedOptimizerWrapper

## DeepSpeedSchedulerWrapper

[[autodoc]] utils.deepspeed.DeepSpeedSchedulerWrapper

## DummyOptim

[[autodoc]] utils.deepspeed.DummyOptim

## DummyScheduler

[[autodoc]] utils.deepspeed.DummyScheduler

================================================
FILE: docs/source/package_reference/fp8.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# FP8

Below are functions and classes related to the underlying FP8 implementation.

## FP8RecipeKwargs

[[autodoc]] utils.FP8RecipeKwargs

## convert_model

[[autodoc]] utils.convert_model

## has_transformer_engine_layers

[[autodoc]] utils.has_transformer_engine_layers

## contextual_fp8_autocast

[[autodoc]] utils.contextual_fp8_autocast

## apply_fp8_autowrap

[[autodoc]] utils.apply_fp8_autowrap


================================================
FILE: docs/source/package_reference/fsdp.md
================================================
<!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Fully Sharded Data Parallel utilities

## enable_fsdp_ram_efficient_loading

[[autodoc]] utils.enable_fsdp_ram_efficient_loading

## disable_fsdp_ram_efficient_loading

[[autodoc]] utils.disable_fsdp_ram_efficient_loading

## merge_fsdp_weights

[[autodoc]] utils.merge_fsdp_weights

## FullyShardedDataParallelPlugin

[[autodoc]] utils.FullyShardedDataParallelPlugin

## fsdp2_load_full_state_dict

[[autodoc]] utils.fsdp2_load_full_state_dict

## fsdp2_switch_optimizer_parameters

[[autodoc]] utils.fsdp2_switch_optimizer_parameters

## fsdp2_prepare_model

[[autodoc]] utils.fsdp2_prepare_model

## fsdp2_prepare_auto_wrap_policy


================================================
FILE: docs/source/package_reference/inference.md
================================================
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Pipeline parallelism

Accelerate supports pipeline parallelism for large-scale training with the PyTorch [torch.distributed.pipelining](https://pytorch.org/docs/stable/distributed.pipelining.html) API.

## prepare_pippy

[[autodoc]] inference.prepare_pippy


================================================
FILE: docs/source/package_reference/kwargs.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Kwargs handlers

The following objects can be passed to the main [`Accelerator`] to customize how some PyTorch objects
related to distributed training or mixed precision are created.

## AutocastKwargs

[[autodoc]] AutocastKwargs

## DistributedDataParallelKwargs

[[autodoc]] DistributedDataParallelKwargs

## FP8RecipeKwargs

[[autodoc]] utils.FP8RecipeKwargs

## ProfileKwargs

[[autodoc]] utils.ProfileKwargs

## GradScalerKwargs

[[autodoc]] GradScalerKwargs

## InitProcessGroupKwargs

[[autodoc]] InitProcessGroupKwargs

## KwargsHandler

[[autodoc]] utils.KwargsHandler


================================================
FILE: docs/source/package_reference/launchers.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Launchers

Functions for launching training on distributed processes.

## notebook_launcher

[[autodoc]] accelerate.notebook_launcher

## debug_launcher

[[autodoc]] accelerate.debug_launcher

================================================
FILE: docs/source/package_reference/logging.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Logging

Refer to the [Troubleshooting guide](../usage_guides/troubleshooting#logging) or to the example below to learn 
how to use Accelerate's logger. 

[[autodoc]] logging.get_logger

================================================
FILE: docs/source/package_reference/megatron_lm.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Megatron-LM utilities

## MegatronLMPlugin

[[autodoc]] utils.MegatronLMPlugin

## MegatronLMDummyScheduler

[[autodoc]] utils.MegatronLMDummyScheduler

## MegatronLMDummyDataLoader

[[autodoc]] utils.MegatronLMDummyDataLoader

## AbstractTrainStep

[[autodoc]] utils.AbstractTrainStep

## GPTTrainStep

[[autodoc]] utils.GPTTrainStep

## BertTrainStep

[[autodoc]] utils.BertTrainStep

## T5TrainStep

[[autodoc]] utils.T5TrainStep

## avg_losses_across_data_parallel_group

[[autodoc]] utils.avg_losses_across_data_parallel_group


================================================
FILE: docs/source/package_reference/state.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Stateful Classes

Below are variations of a [singleton class](https://en.wikipedia.org/wiki/Singleton_pattern) in the sense that all
instances share the same state, which is initialized on the first instantiation.

These classes are immutable and store information about certain configurations or 
states.

## PartialState

[[autodoc]] state.PartialState

## AcceleratorState

[[autodoc]] state.AcceleratorState

## GradientState

[[autodoc]] state.GradientState

================================================
FILE: docs/source/package_reference/torch_wrappers.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# DataLoaders, Optimizers, and Schedulers

The internal classes Accelerate uses to prepare objects for distributed training
when calling [`~Accelerator.prepare`].

## DataLoader utilities

[[autodoc]] data_loader.prepare_data_loader

[[autodoc]] data_loader.skip_first_batches

## BatchSamplerShard

[[autodoc]] data_loader.BatchSamplerShard

## IterableDatasetShard

[[autodoc]] data_loader.IterableDatasetShard

## DataLoaderShard

[[autodoc]] data_loader.DataLoaderShard

## DataLoaderDispatcher

[[autodoc]] data_loader.DataLoaderDispatcher

## AcceleratedOptimizer

[[autodoc]] optimizer.AcceleratedOptimizer

## AcceleratedScheduler

[[autodoc]] scheduler.AcceleratedScheduler

================================================
FILE: docs/source/package_reference/tracking.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Experiment Trackers

## GeneralTracker

[[autodoc]] tracking.GeneralTracker

## TensorBoardTracker

[[autodoc]] tracking.TensorBoardTracker
    - __init__

## WandBTracker

[[autodoc]] tracking.WandBTracker
    - __init__

## Trackio

[[autodoc]] tracking.TrackioTracker
    - __init__

## CometMLTracker

[[autodoc]] tracking.CometMLTracker
    - __init__

## AimTracker

[[autodoc]] tracking.AimTracker
    - __init__

## MLflowTracker

[[autodoc]] tracking.MLflowTracker
    - __init__

## ClearMLTracker

[[autodoc]] tracking.ClearMLTracker
    - __init__

## SwanLabTracker

[[autodoc]] tracking.SwanLabTracker
    - __init__


================================================
FILE: docs/source/package_reference/utilities.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Utility functions and classes

Below are a variety of utility functions that 🤗 Accelerate provides, broken down by use-case. 

## Constants

Constants used throughout 🤗 Accelerate for reference

The following are constants used when utilizing [`Accelerator.save_state`]

`utils.MODEL_NAME`: `"pytorch_model"`
`utils.OPTIMIZER_NAME`: `"optimizer"`
`utils.RNG_STATE_NAME`: `"random_states"`
`utils.SCALER_NAME`: `"scaler.pt"`
`utils.SCHEDULER_NAME`: `"scheduler"`

The following are constants used when utilizing [`Accelerator.save_model`]

`utils.WEIGHTS_NAME`: `"pytorch_model.bin"`
`utils.SAFE_WEIGHTS_NAME`: `"model.safetensors"`
`utils.WEIGHTS_INDEX_NAME`: `"pytorch_model.bin.index.json"`
`utils.SAFE_WEIGHTS_INDEX_NAME`: `"model.safetensors.index.json"`

## Data Classes

These are basic dataclasses used throughout 🤗 Accelerate and they can be passed in as parameters.

### Standalone

These are standalone dataclasses used for checks, such as the type of distributed system being used

[[autodoc]] utils.ComputeEnvironment

[[autodoc]] utils.DistributedType

[[autodoc]] utils.DynamoBackend

[[autodoc]] utils.LoggerType

[[autodoc]] utils.PrecisionType

[[autodoc]] utils.RNGType

[[autodoc]] utils.SageMakerDistributedType

### Kwargs

These are configurable arguments for specific interactions throughout the PyTorch ecosystem that Accelerate handles under the hood.

[[autodoc]] utils.AutocastKwargs

[[autodoc]] utils.DistributedDataParallelKwargs

[[autodoc]] utils.FP8RecipeKwargs

[[autodoc]] utils.GradScalerKwargs

[[autodoc]] utils.InitProcessGroupKwargs

[[autodoc]] utils.KwargsHandler

## Plugins

These are plugins that can be passed to the [`Accelerator`] object. While they are defined elsewhere in the documentation, 
for convenience all of them are available to see here:

[[autodoc]] utils.DeepSpeedPlugin

[[autodoc]] utils.FullyShardedDataParallelPlugin

[[autodoc]] utils.GradientAccumulationPlugin

[[autodoc]] utils.MegatronLMPlugin

[[autodoc]] utils.TorchDynamoPlugin

## Configurations

These are classes which can be configured and passed through to the appropriate integration

[[autodoc]] utils.BnbQuantizationConfig

[[autodoc]] utils.DataLoaderConfiguration

[[autodoc]] utils.ProjectConfiguration

## Environmental Variables

These are environmental variables that can be enabled for different use cases

* `ACCELERATE_DEBUG_MODE` (`str`): Whether to run accelerate in debug mode. More info available [here](../usage_guides/debug.md).


## Data Manipulation and Operations

These include data operations that mimic the same `torch` ops but can be used on distributed processes.

[[autodoc]] utils.broadcast

[[autodoc]] utils.broadcast_object_list

[[autodoc]] utils.concatenate

[[autodoc]] utils.convert_outputs_to_fp32

[[autodoc]] utils.convert_to_fp32

[[autodoc]] utils.gather

[[autodoc]] utils.gather_object

[[autodoc]] utils.get_grad_scaler

[[autodoc]] utils.get_mixed_precision_context_manager

[[autodoc]] utils.listify

[[autodoc]] utils.pad_across_processes

[[autodoc]] utils.recursively_apply

[[autodoc]] utils.reduce

[[autodoc]] utils.send_to_device

[[autodoc]] utils.slice_tensors

## Environment Checks

These functionalities check the state of the current working environment including information about the operating system itself, what it can support, and if particular dependencies are installed. 

[[autodoc]] utils.is_bf16_available

[[autodoc]] utils.is_mps_available

[[autodoc]] utils.is_npu_available

[[autodoc]] utils.is_torch_version

[[autodoc]] utils.is_torch_xla_available

[[autodoc]] utils.is_xpu_available

## Environment Manipulation

[[autodoc]] utils.patch_environment

[[autodoc]] utils.clear_environment

[[autodoc]] utils.write_basic_config

When setting up 🤗 Accelerate for the first time, rather than running `accelerate config`, [`~utils.write_basic_config`] can be used as an alternative for quick configuration.

[[autodoc]] utils.set_numa_affinity

[[autodoc]] utils.environment.override_numa_affinity

[[autodoc]] utils.purge_accelerate_environment

## Memory

[[autodoc]] utils.find_executable_batch_size

## Modeling

These utilities relate to interacting with PyTorch models

[[autodoc]] utils.calculate_maximum_sizes

[[autodoc]] utils.compute_module_sizes

[[autodoc]] utils.extract_model_from_parallel

[[autodoc]] utils.get_balanced_memory

[[autodoc]] utils.get_max_layer_size

[[autodoc]] utils.infer_auto_device_map

[[autodoc]] utils.load_checkpoint_in_model

[[autodoc]] utils.load_offloaded_weights

[[autodoc]] utils.load_state_dict

[[autodoc]] utils.offload_state_dict

[[autodoc]] utils.retie_parameters

[[autodoc]] utils.set_module_tensor_to_device

[[autodoc]] utils.get_module_children_bottom_up

## Parallel

These include general utilities that should be used when working in parallel.

[[autodoc]] utils.extract_model_from_parallel

[[autodoc]] utils.save

[[autodoc]] utils.load

[[autodoc]] utils.wait_for_everyone


## Random

These utilities relate to setting and synchronizing of all the random states.

[[autodoc]] utils.set_seed

[[autodoc]] utils.synchronize_rng_state

[[autodoc]] utils.synchronize_rng_states


## PyTorch XLA

These include utilities that are useful while using PyTorch with XLA.

[[autodoc]] utils.install_xla

## Loading model weights

These include utilities that are useful to load checkpoints.

[[autodoc]] utils.load_checkpoint_in_model

## Quantization

These include utilities that are useful to quantize models.

[[autodoc]] utils.load_and_quantize_model


================================================
FILE: docs/source/quicktour.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Quicktour

There are many ways to launch and run your code depending on your training environment ([torchrun](https://pytorch.org/docs/stable/elastic/run.html), [DeepSpeed](https://www.deepspeed.ai/), etc.) and available hardware. Accelerate offers a unified interface for launching and training on different distributed setups, allowing you to focus on your PyTorch training code instead of the intricacies of adapting your code to these different setups. This allows you to easily scale your PyTorch code for training and inference on distributed setups with hardware like GPUs and TPUs. Accelerate also provides Big Model Inference to make loading and running inference with really large models that usually don't fit in memory more accessible.

This quicktour introduces the three main features of Accelerate:

* a unified command line launching interface for distributed training scripts
* a training library for adapting PyTorch training code to run on different distributed setups
* Big Model Inference

## Unified launch interface

Accelerate automatically selects the appropriate configuration values for any given distributed training framework (DeepSpeed, FSDP, etc.) through a unified configuration file generated from the [`accelerate config`](package_reference/cli#accelerate-config) command. You could also pass the configuration values explicitly to the command line which is helpful in certain situations like if you're using SLURM.


But in most cases, you should run [`accelerate config`](package_reference/cli#accelerate-config) first to help Accelerate learn about your training setup.

```bash
accelerate config
```

The [`accelerate config`](package_reference/cli#accelerate-config) command creates and saves a default_config.yaml file in Accelerate's cache folder. This file stores the configuration for your training environment, which helps Accelerate correctly launch your training script based on your machine.

After you've configured your environment, you can test your setup with [`accelerate test`](package_reference/cli#accelerate-test), which launches a short script to test the distributed environment.

```bash
accelerate test
```

> [!TIP]
> Add `--config_file` to the `accelerate test` or `accelerate launch` command to specify the location of the configuration file if it is saved in a non-default location (that is, anywhere other than Accelerate's cache folder).

Once your environment is set up, launch your training script with [`accelerate launch`](package_reference/cli#accelerate-launch)!

```bash
accelerate launch path_to_script.py --args_for_the_script
```

Check out the [Launch distributed code](basic_tutorials/launch) tutorial for more information about launching your scripts.

We also have a [configuration zoo](https://github.com/huggingface/accelerate/blob/main/examples/config_yaml_templates) which showcases a number of premade **minimal** example configurations for a variety of setups you can run.

## Adapt training code

The next main feature of Accelerate is the [`Accelerator`] class which adapts your PyTorch code to run on different distributed setups.

You only need to add a few lines of code to your training script to enable it to run on multiple GPUs or TPUs.

```diff
+ from accelerate import Accelerator
+ accelerator = Accelerator()

+ device = accelerator.device
+ model, optimizer, training_dataloader, scheduler = accelerator.prepare(
+     model, optimizer, training_dataloader, scheduler
+ )

  for batch in training_dataloader:
      optimizer.zero_grad()
      inputs, targets = batch
-     inputs = inputs.to(device)
-     targets = targets.to(device)
      outputs = model(inputs)
      loss = loss_function(outputs, targets)
+     accelerator.backward(loss)
      optimizer.step()
      scheduler.step()
```

1. Import and instantiate the [`Accelerator`] class at the beginning of your training script. The [`Accelerator`] class initializes everything necessary for distributed training, and it automatically detects your training environment (a single machine with a GPU, a machine with several GPUs, several machines with multiple GPUs or a TPU, etc.) based on how the code was launched.

```python
from accelerate import Accelerator

accelerator = Accelerator()
```

2. Remove calls like `.cuda()` on your model and input data. The [`Accelerator`] class automatically places these objects on the appropriate device for you.

> [!WARNING]
> This step is *optional* but it is considered best practice to allow Accelerate to handle device placement. You could also deactivate automatic device placement by passing `device_placement=False` when initializing the [`Accelerator`]. If you want to explicitly place objects on a device with `.to(device)`, make sure you use `accelerator.device` instead. For example, if you create an optimizer before placing a model on `accelerator.device`, training fails on a TPU.

> [!WARNING]
> Accelerate does not use non-blocking transfers by default for its automatic device placement, which can result in potentially unwanted CUDA synchronizations.  You can enable non-blocking transfers by passing a [`~utils.dataclasses.DataLoaderConfiguration`] with `non_blocking=True` set as the `dataloader_config` when initializing the [`Accelerator`].  As usual, non-blocking transfers will only work if the dataloader also has `pin_memory=True` set.  Be wary that using non-blocking transfers from GPU to CPU may cause incorrect results if it results in CPU operations being performed on non-ready tensors.

```py
device = accelerator.device
```

3. Pass all relevant PyTorch objects for training (optimizer, model, dataloader(s), learning rate scheduler) to the [`~Accelerator.prepare`] method as soon as they're created. This method wraps the model in a container optimized for your distributed setup, uses Accelerate's version of the optimizer and scheduler, and creates a sharded version of your dataloader for distribution across GPUs or TPUs.

```python
model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)
```

4. Replace `loss.backward()` with [`~Accelerator.backward`] to use the correct `backward()` method for your training setup.

```py
accelerator.backward(loss)
```

Read [Accelerate’s internal mechanisms](concept_guides/internal_mechanism) guide to learn more details about how Accelerate adapts your code.

### Distributed evaluation

To perform distributed evaluation, pass your validation dataloader to the [`~Accelerator.prepare`] method:

```python
validation_dataloader = accelerator.prepare(validation_dataloader)
```

Each device in your distributed setup only receives a part of the evaluation data, which means you should group your predictions together with the [`~Accelerator.gather_for_metrics`] method. This method requires all tensors to be the same size on each process, so if your tensors have different sizes on each process (for instance when dynamically padding to the maximum length in a batch), you should use the [`~Accelerator.pad_across_processes`] method to pad your tensors to the largest size across processes. Note that the tensors need to be 1D and that we concatenate the tensors along the first dimension.

```python
for inputs, targets in validation_dataloader:
    predictions = model(inputs)
    # Gather all predictions and targets
    all_predictions, all_targets = accelerator.gather_for_metrics((predictions, targets))
    # Example of use with a *Datasets.Metric*
    metric.add_batch(all_predictions, all_targets)
```

For more complex cases (e.g. 2D tensors, don't want to concatenate tensors, dict of 3D tensors), you can pass `use_gather_object=True` in `gather_for_metrics`. This will return the list of objects after gathering. Note that using it with GPU tensors is not well supported and inefficient.

> [!TIP]
> Data at the end of a dataset may be duplicated so the batch can be equally divided among all workers. The [`~Accelerator.gather_for_metrics`] method automatically removes the duplicated data to calculate a more accurate metric.

## Big Model Inference

Accelerate's Big Model Inference has two main features, [`~accelerate.init_empty_weights`] and [`~accelerate.load_checkpoint_and_dispatch`], to load large models for inference that typically don't fit into memory.

> [!TIP]
> Take a look at the [Handling big models for inference](concept_guides/big_model_inference) guide for a better understanding of how Big Model Inference works under the hood.

### Empty weights initialization

The [`~accelerate.init_empty_weights`] context manager initializes models of any size by creating a *model skeleton* and moving and placing parameters each time they're created to PyTorch's [**meta**](https://pytorch.org/docs/main/meta.html) device. This way, not all weights are immediately loaded and only a small part of the model is loaded into memory at a time.

For example, loading an empty [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) model takes significantly less memory than fully loading the models and weights on the CPU.

```py
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)
```

### Load and dispatch weights

The [`~accelerate.load_checkpoint_and_dispatch`] function loads full or sharded checkpoints into the empty model, and automatically distributes weights across all available devices.

The `device_map` parameter determines where to place each model layer, and specifying `"auto"` places them on the GPU first, then the CPU, and finally the hard drive as memory-mapped tensors if there's still not enough memory. Use the `no_split_module_classes` parameter to indicate which modules shouldn't be split across devices (typically those with a residual connection).

```py
from accelerate import load_checkpoint_and_dispatch

model_checkpoint = "your-local-model-folder"
model = load_checkpoint_and_dispatch(
    model, checkpoint=model_checkpoint, device_map="auto", no_split_module_classes=['Block']
)
```

## Next steps

Now that you've been introduced to the main Accelerate features, your next steps could include:

* Check out the [tutorials](basic_tutorials/overview) for a gentle walkthrough of Accelerate. This is especially useful if you're new to distributed training and the library.
* Dive into the [guides](usage_guides/explore) to see how to use Accelerate for specific use-cases.
* Deepen your conceptual understanding of how Accelerate works internally by reading the [concept guides](concept_guides/internal_mechanism).
* Look up classes and commands in the [API reference](package_reference/accelerator) to see what parameters and options are available.


================================================
FILE: docs/source/usage_guides/big_modeling.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Big Model Inference

One of the biggest advancements Accelerate provides is [Big Model Inference](../concept_guides/big_model_inference), which allows you to perform inference with models that don't fully fit on your graphics card.

This tutorial will show you how to use Big Model Inference in Accelerate and the Hugging Face ecosystem.

## Accelerate

A typical workflow for loading a PyTorch model is shown below. `ModelClass` is a model that exceeds the GPU memory of your device (mps or cuda or xpu).

```py
import torch

my_model = ModelClass(...)
state_dict = torch.load(checkpoint_file)
my_model.load_state_dict(state_dict)
```

With Big Model Inference, the first step is to init an empty skeleton of the model with the `init_empty_weights` context manager. This doesn't require any memory because `my_model` is "parameterless".

```py
from accelerate import init_empty_weights
with init_empty_weights():
    my_model = ModelClass(...)
```

Next, the weights are loaded into the model for inference.

The [`load_checkpoint_and_dispatch`] method loads a checkpoint inside your empty model and dispatches the weights for each layer across all available devices, starting with the fastest devices (GPU, MPS, XPU, NPU, MLU, SDAA, MUSA) first before moving to the slower ones (CPU and hard drive).

Setting `device_map="auto"` automatically fills all available space on the GPU(s) first, then the CPU, and finally, the hard drive (the absolute slowest option) if there is still not enough memory.

> [!TIP]
> Refer to the [Designing a device map](../concept_guides/big_model_inference#designing-a-device-map) guide for more details on how to design your own device map.

```py
from accelerate import load_checkpoint_and_dispatch

model = load_checkpoint_and_dispatch(
    model, checkpoint=checkpoint_file, device_map="auto"
)
```

If there are certain “chunks” of layers that shouldn’t be split, pass them to `no_split_module_classes` (see [here](../concept_guides/big_model_inference#loading-weights) for more details).

A model's weights can also be sharded into multiple checkpoints to save memory, such as when the `state_dict` doesn't fit in memory (see [here](../concept_guides/big_model_inference#sharded-checkpoints) for more details).

Now that the model is fully dispatched, you can perform inference.

```py
input = torch.randn(2,3)
device_type = next(iter(model.parameters())).device.type
input = input.to(device_type)
output = model(input)
```

Each time an input is passed through a layer, it is sent from the CPU to the GPU (or disk to CPU to GPU), the output is calculated, and the layer is removed from the GPU going back down the line. While this adds some overhead to inference, it enables you to run any size model on your system, as long as the largest layer fits on your GPU.

Multiple GPUs, or "model parallelism", can be utilized but only one GPU will be active at any given moment. This forces the GPU to wait for the previous GPU to send it the output. You should launch your script normally with Python instead of other tools like torchrun and accelerate launch.

> [!TIP]
> You may also be interested in *pipeline parallelism* which utilizes all available GPUs at once, instead of only having one GPU active at a time. This approach is less flexible though. For more details, refer to the [Memory-efficient pipeline parallelism](./distributed_inference#memory-efficient-pipeline-parallelism-experimental) guide.

<Youtube id="MWCSGj9jEAo"/>

Take a look at a full example of Big Model Inference below.

```py
import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

with init_empty_weights():
    model = MyModel(...)

model = load_checkpoint_and_dispatch(
    model, checkpoint=checkpoint_file, device_map="auto"
)

input = torch.randn(2,3)
device_type = next(iter(model.parameters())).device.type
input = input.to(device_type)
output = model(input)
```

## Hugging Face ecosystem

Other libraries in the Hugging Face ecosystem, like Transformers or Diffusers, support Big Model Inference in their [`~transformers.PreTrainedModel.from_pretrained`] constructors.

You just need to add `device_map="auto"` in [`~transformers.PreTrainedModel.from_pretrained`] to enable Big Model Inference.

For example, load BigScience's T0pp 11-billion-parameter model with Big Model Inference.

```py
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", device_map="auto")
```

After loading the model, the empty init and smart dispatch steps from before are executed and the model is fully ready to make use of all the resources in your machine. Through these constructors, you can also save more memory by specifying the `torch_dtype` parameter to load a model in a lower precision.

```py
import torch
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", device_map="auto", torch_dtype=torch.float16)
```

## Next steps

For a more detailed explanation of Big Model Inference, make sure to check out the [conceptual guide](../concept_guides/big_model_inference)!


================================================
FILE: docs/source/usage_guides/checkpoint.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Checkpointing

When training a PyTorch model with Accelerate, you may often want to save and continue a state of training. Doing so requires
saving and loading the model, optimizer, RNG generators, and the GradScaler. Inside Accelerate are two convenience functions to achieve this quickly:
- Use [`~Accelerator.save_state`] for saving everything mentioned above to a folder location
- Use [`~Accelerator.load_state`] for loading everything stored from an earlier `save_state`

To further customize where and how states are saved through [`~Accelerator.save_state`], the [`~utils.ProjectConfiguration`] class can be used. For example,
if `automatic_checkpoint_naming` is enabled, each saved checkpoint will be located at `Accelerator.project_dir/checkpoints/checkpoint_{checkpoint_number}`.

It should be noted that the expectation is that those states come from the same training script; they should not be from two separate scripts.

- By using [`~Accelerator.register_for_checkpointing`], you can register custom objects to be automatically stored or loaded from the two prior functions,
so long as the object has a `state_dict` **and** a `load_state_dict` functionality. This could include objects such as a learning rate scheduler. 


Below is a brief example using checkpointing to save and reload a state during training:

```python
from accelerate import Accelerator
import torch

accelerator = Accelerator(project_dir="my/save/path")

my_scheduler = torch.optim.lr_scheduler.StepLR(my_optimizer, step_size=1, gamma=0.99)
my_model, my_optimizer, my_training_dataloader = accelerator.prepare(my_model, my_optimizer, my_training_dataloader)

# Register the LR scheduler
accelerator.register_for_checkpointing(my_scheduler)

# Save the starting state
accelerator.save_state()

device = accelerator.device
my_model.to(device)

# Perform training
for epoch in range(num_epochs):
    for batch in my_training_dataloader:
        my_optimizer.zero_grad()
        inputs, targets = batch
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = my_model(inputs)
        loss = my_loss_function(outputs, targets)
        accelerator.backward(loss)
        my_optimizer.step()
    my_scheduler.step()

# Restore the previous state
accelerator.load_state("my/save/path/checkpoints/checkpoint_0")
```

## Restoring the state of the DataLoader 

After resuming from a checkpoint, it may also be desirable to resume from a particular point in the active `DataLoader` if 
the state was saved during the middle of an epoch. You can use [`~Accelerator.skip_first_batches`] to do so. 

```python
from accelerate import Accelerator

accelerator = Accelerator(project_dir="my/save/path")

train_dataloader = accelerator.prepare(train_dataloader)
accelerator.load_state("my_state")

# Assume the checkpoint was saved 100 steps into the epoch
skipped_dataloader = accelerator.skip_first_batches(train_dataloader, 100)

# After the first iteration, go back to `train_dataloader`

# First epoch
for batch in skipped_dataloader:
    # Do something
    pass

# Second epoch
for batch in train_dataloader:
    # Do something
    pass
```


================================================
FILE: docs/source/usage_guides/compilation.md
================================================
# Compilation

## Overview

PyTorch 2.0 introduced `torch.compile`, a powerful feature that makes PyTorch code run faster by JIT-compiling PyTorch code into optimized kernels. Key features of `torch.compile` include:

- **Performance Improvement**: Significantly speeds up model execution by optimizing the computation graph.
- **Ease of Use**: Requires minimal code changes to implement, making it highly accessible.
- **Compatibility**: Works seamlessly with existing PyTorch code and models.

When used with Accelerate, `torch.compile` integrates smoothly into distributed training workflows, allowing you to benefit from both distributed execution and compilation optimizations simultaneously.

The first execution of compiled code typically takes longer as it includes the compilation time, but subsequent runs are significantly faster. For optimal performance in different scenarios, `torch.compile` offers various modes like `"default"`, `"reduce-overhead"` (which uses CUDA graphs to further reduce overhead), and `"max-autotune"` (which performs extensive autotuning to find the best kernels for your model).

## Using `torch.compile` with Accelerate

Accelerate provides `TorchDynamoPlugin` for easy and seamless integration of `torch.compile` into your training scripts.

```python
from accelerate import Accelerator
from accelerate.utils import TorchDynamoPlugin

# Configure the compilation backend
dynamo_plugin = TorchDynamoPlugin(
    backend="inductor",  # Options: "inductor", "aot_eager", "aot_nvfuser", etc.
    mode="default",      # Options: "default", "reduce-overhead", "max-autotune"
    fullgraph=True,
    dynamic=False
)

# Initialize accelerator with the plugin
accelerator = Accelerator(dynamo_plugin=dynamo_plugin)
# This will apply torch.compile to your model
model = accelerator.prepare(model)
```

It is compatible with all other features and plugins of Accelerate, including mixed precision, distributed training (DDP, FSDP, DeepSpeed), etc.

## Regional Compilation

Instead of trying to compile the whole model, which usually has a big problem space for optimization, regional compilation targets repeated blocks of the same class and compiles them sequentially to hit the compiler's cache. For example, in `GPT2LMHeadModel`, the repeated block/class is `GPT2Block`, and can be accessed as `model.transformer.h[0]`. The rest of the model (e.g. `model.lm_head`) is compiled separately.

This allows us to speed up the compilation overhead / cold start of models like LLMs and Transformers in general.
See <https://pytorch.org/tutorials/recipes/regional_compilation.html> for more details.

### How to Use Regional Compilation

It can be enabled by setting `use_regional_compilation=True` in the `TorchDynamoPlugin` configuration:

```python
# Configure the compilation backend
dynamo_plugin = TorchDynamoPlugin(
    use_regional_compilation=True,
    ... # other parameters
)
# Initialize accelerator with the plugin
accelerator = Accelerator(dynamo_plugin=dynamo_plugin)
# This will apply compile_regions to your model
model = accelerator.prepare(model)
```

You could also use the `accelerate.utils.compile_regions` utility directly the same way you would use `torch.compile`.

### Benefits of Regional Compilation

We have conducted extensive benchmarks comparing full compilation and regional compilation using the `torch.compile` feature in PyTorch. The full results are available in the [accelerate repository](https://github.com/huggingface/accelerate/tree/main/benchmarks/torch.compile). The key findings from our benchmarks are:

1. **Comparable Performance**: Regional compilation delivers performance speedups similar to full compilation, especially for larger models.
2. **Faster Compilation**: Regional compilation significantly reduces the time taken to compile models, making it a more efficient choice for deployment.
3. **Batch Size Impact**: The performance difference between compilation strategies diminishes with larger batch sizes, indicating that the overhead of compilation is less impactful in those scenarios.
4. **Model Size Consideration**: The benefits of regional compilation are more pronounced in larger models, where the compilation time savings can be substantial.
5. **Practical Application**: For real-world applications, regional compilation is a practical choice for optimizing training cold start times, especially when working with large models.

## Conclusion

Both full and regional compilation can significantly speed up your models. Regional compilation offers a practical balance between compilation time and runtime performance, especially for training large models with substantial batch sizes.


================================================
FILE: docs/source/usage_guides/ddp_comm_hook.md
================================================
<!--
Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# DDP Communication Hooks

Distributed Data Parallel (DDP) communication hooks provide a generic interface to control how gradients are communicated across workers by overriding the vanilla allreduce in `DistributedDataParallel`. A few built-in communication hooks are provided, and users can easily apply any of these hooks to optimize communication.


- **FP16 Compression Hook**: Compresses gradients by casting them to half-precision floating-point format (`torch.float16`), reducing communication overhead.
- **BF16 Compression Hook**: Similar to FP16, but uses the Brain Floating Point format (`torch.bfloat16`), which can be more efficient on certain hardware.
- **PowerSGD Hook**: An advanced gradient compression algorithm that provides high compression rates and can accelerate bandwidth-bound distributed training.

In this tutorial, you will see how to quickly set up DDP communication hooks and perform training with the utilities provided in Accelerate, which can be as simple as adding just one new line of code! This demonstrates how to use DDP communication hooks to optimize gradient communication in distributed training with the Accelerate library.

## FP16 Compression Hook

<hfoptions id="fp16">
<hfoption id="PyTorch">

```python
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
from accelerate.test_utils.testing import get_backend

device_type, _, _ = get_backend()
device_id = getattr(torch, device_type, torch.cuda).current_device()

class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(10, 10)

    def forward(self, x):
        return self.layer(x)

model = MyModel()
model = DDP(model, device_ids=[device_id])
model.register_comm_hook(state=None, hook=default_hooks.fp16_compress_hook)

# Training loop
for data, targets in data_loader:
    outputs = model(data)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```

</hfoption>
<hfoption id="Accelerate">

```python
from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs
import torch

class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(10, 10)

    def forward(self, x):
        return self.layer(x)

# DDP Communication Hook setup
ddp_kwargs = DistributedDataParallelKwargs(comm_hook=DDPCommunicationHookType.FP16)
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])

model = MyModel()
optimizer = torch.optim.Adam(model.parameters())
data_loader = DataLoader(dataset, batch_size=16)

model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)

# Training loop
for data, targets in data_loader:
    outputs = model(data)
    loss = criterion(outputs, targets)
    accelerator.backward(loss)
    optimizer.step()
    optimizer.zero_grad()
```

</hfoption>
</hfoptions>

### BF16 Compression Hook

<Tip warning={true}>

BF16 Compression Hook API is experimental, and it requires NCCL version later than 2.9.6.

</Tip>

<hfoptions id="bf16">
<hfoption id="PyTorch">

```python
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
from accelerate.test_utils.testing import get_backend

device_type, _, _ = get_backend()
device_id = getattr(torch, device_type, torch.cuda).current_device()

class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(10, 10)

    def forward(self, x):
        return self.layer(x)

model = MyModel()
model = DDP(model, device_ids=[device_id])
model.register_comm_hook(state=None, hook=default_hooks.bf16_compress_hook)

# Training loop
for data, targets in data_loader:
    outputs = model(data)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```

</hfoption>
<hfoption id="Accelerate">

```python
from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs
import torch

class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(10, 10)

    def forward(self, x):
        return self.layer(x)

# DDP Communication Hook setup
ddp_kwargs = DistributedDataParallelKwargs(comm_hook=DDPCommunicationHookType.BF16)
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])

model = MyModel()
optimizer = torch.optim.Adam(model.parameters())
data_loader = DataLoader(dataset, batch_size=16)

model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)

# Training loop
for data, targets in data_loader:
    outputs = model(data)
    loss = criterion(outputs, targets)
    accelerator.backward(loss)
    optimizer.step()
    optimizer.zero_grad()
```

</hfoption>
</hfoptions>

### PowerSGD Hook

<Tip warning={true}>

PowerSGD typically requires extra memory of the same size as the model’s gradients to enable error feedback, which can compensate for biased compressed communication and improve accuracy.

</Tip>

<hfoptions id="powerSGD">
<hfoption id="PyTorch">

```python
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.algorithms.ddp_comm_hooks import powerSGD_hook
from accelerate.test_utils.testing import get_backend

device_type, _, _ = get_backend()
device_id = getattr(torch, device_type, torch.cuda).current_device()

class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(10, 10)

    def forward(self, x):
        return self.layer(x)

model = MyModel()
model = DDP(model, device_ids=[device_id])
state = powerSGD_hook.PowerSGDState(process_group=None)
model.register_comm_hook(state=state, hook=powerSGD_hook.powerSGD_hook)

# Training loop
for data, targets in data_loader:
    outputs = model(data)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```

</hfoption>
<hfoption id="Accelerate">

```python
from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs
import torch

class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(10, 10)

    def forward(self, x):
        return self.layer(x)

# DDP Communication Hook setup
ddp_kwargs = DistributedDataParallelKwargs(comm_hook=DDPCommunicationHookType.POWER_SGD)
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])

model = MyModel()
optimizer = torch.optim.Adam(model.parameters())
data_loader = DataLoader(dataset, batch_size=16)

model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)

# Training loop
for data, targets in data_loader:
    outputs = model(data)
    loss = criterion(outputs, targets)
    accelerator.backward(loss)
    optimizer.step()
    optimizer.zero_grad()
```

</hfoption>
</hfoptions>

## DDP Communication Hooks utilities

There are two additional utilities for supporting optional functionalities with the communication hooks.

### comm_wrapper

`comm_wrapper` is an option to wrap a communication hook with additional functionality. For example, it can be used to combine FP16 compression with other communication strategies. Currently supported wrappers are `no`, `fp16`, and `bf16`.

```python
from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs
import torch

class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(10, 10)

    def forward(self, x):
        return self.layer(x)

# DDP Communication Hook setup
ddp_kwargs = DistributedDataParallelKwargs(
    comm_hook=DDPCommunicationHookType.POWER_SGD,
    comm_wrapper=DDPCommunicationHookType.FP16
)
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])

model = MyModel()
optimizer = torch.optim.Adam(model.parameters())
data_loader = DataLoader(dataset, batch_size=16)

model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)

# Training loop
for data, targets in data_loader:
    outputs = model(data)
    loss = criterion(outputs, targets)
    accelerator.backward(loss)
    optimizer.step()
    optimizer.zero_grad()
```

### comm_state_option

`comm_state_option` allows you to pass additional state information required by certain communication hooks. This is particularly useful for stateful hooks like `PowerSGD`, which require maintaining hyperparameters and internal states across training steps. Below is an example showcasing the use of `comm_state_option` with the `PowerSGD` hook.

```python
from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs
import torch

class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(10, 10)

    def forward(self, x):
        return self.layer(x)

# DDP Communication Hook setup
ddp_kwargs = DistributedDataParallelKwargs(
    comm_hook=DDPCommunicationHookType.POWER_SGD,
    comm_state_option={"matrix_approximation_rank": 2}
)
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])

model = MyModel()
optimizer = torch.optim.Adam(model.parameters())
data_loader = DataLoader(dataset, batch_size=16)

model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)

# Training loop
for data, targets in data_loader:
    outputs = model(data)
    loss = criterion(outputs, targets)
    accelerator.backward(loss)
    optimizer.step()
    optimizer.zero_grad()
```

For more advanced usage and additional hooks, refer to the [PyTorch DDP Communication Hooks documentation](https://pytorch.org/docs/stable/ddp_comm_hooks.html).


================================================
FILE: docs/source/usage_guides/deepspeed.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# DeepSpeed

[DeepSpeed](https://github.com/deepspeedai/DeepSpeed) implements everything described in the [ZeRO paper](https://huggingface.co/papers/1910.02054). Some of the salient optimizations are:

1. Optimizer state partitioning (ZeRO stage 1)
2. Gradient partitioning (ZeRO stage 2)
3. Parameter partitioning (ZeRO stage 3)
4. Custom mixed precision training handling
5. A range of fast CUDA-extension-based optimizers
6. ZeRO-Offload to CPU and Disk/NVMe
7. Hierarchical partitioning of model parameters (ZeRO++)

ZeRO-Offload has its own dedicated paper: [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://huggingface.co/papers/2101.06840). And NVMe-support is described in the paper [ZeRO-Infinity: Breaking the GPU
Memory Wall for Extreme Scale Deep Learning](https://huggingface.co/papers/2104.07857).

DeepSpeed ZeRO-2 is primarily used only for training, as its features are of no use to inference.

DeepSpeed ZeRO-3 can be used for inference as well since it allows huge models to be loaded on multiple GPUs, which
won't be possible on a single GPU.

Accelerate integrates [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) via 2 options:

1. Integration of the DeepSpeed features via `deepspeed config file` specification in `accelerate config`. You just supply your custom config file or use our template. Most of
   this document is focused on this feature. This supports all the core features of DeepSpeed and gives the user a lot of flexibility.
   The user may have to change a few lines of code depending on the config.
2. Integration via `deepspeed_plugin`. This supports a subset of the DeepSpeed features and uses default options for the rest of the configurations.
   The user does not need to change any code, which makes it a good fit for those who are fine with most of the default settings of DeepSpeed.

## What is integrated?

Training:

1. Accelerate integrates all features of DeepSpeed ZeRO. This includes all the ZeRO stages 1, 2 and 3 as well as ZeRO-Offload, ZeRO-Infinity (which can offload to disk/NVMe) and ZeRO++.
Below is a short description of Data Parallelism using ZeRO - Zero Redundancy Optimizer along with diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/)
![ZeRO Data Parallelism](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero.png)

(Source: [link](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/))

 a. **Stage 1** : Shards optimizer states across data parallel workers/GPUs

 b. **Stage 2** : Shards optimizer states + gradients across data parallel workers/GPUs

 c. **Stage 3**: Shards optimizer states + gradients + model parameters across data parallel workers/GPUs

 d. **Optimizer Offload**: Offloads the gradients + optimizer states to CPU/Disk building on top of ZERO Stage 2

 e. **Param Offload**: Offloads the model parameters to CPU/Disk building on top of ZERO Stage 3

 f. **Hierarchical Partitioning**: Enables efficient multi-node training with data-parallel training across nodes and ZeRO-3 sharding within a node, built on top of ZeRO Stage 3.

<u>Note</u>: With respect to Disk Offload, the disk should be an NVMe drive for decent speed, but it technically works on any disk.

Inference:

1. DeepSpeed ZeRO Inference supports ZeRO stage 3 with ZeRO-Infinity. It uses the same ZeRO protocol as training, but
   it doesn't use an optimizer and a lr scheduler and only stage 3 is relevant. For more details see:
   [deepspeed-zero-inference](#deepspeed-zero-inference).


## How it works?

**Pre-Requisites**: Install DeepSpeed version >=0.6.5. Please refer to the [DeepSpeed Installation details](https://github.com/deepspeedai/DeepSpeed#installation)
for more information.

We will first look at the easy-to-use integration via `accelerate config`,
followed by the more flexible and feature-rich `deepspeed config file` integration.

### Accelerate DeepSpeed Plugin
On your machine(s) just run:

```bash
accelerate config
```

and answer the questions asked. It will ask whether you want to use a config file for DeepSpeed to which you should answer no. Then answer the following questions to generate a basic DeepSpeed config.
This will generate a config file that will be used automatically to properly set the
default options when doing

```bash
accelerate launch my_script.py --args_to_my_script
```

For instance, here is how you would run the NLP example `examples/nlp_example.py` (from the root of the repo) with DeepSpeed Plugin:

**ZeRO Stage-2 DeepSpeed Plugin Example**
```bash
compute_environment: LOCAL_MACHINE
deepspeed_config:
 gradient_accumulation_steps: 1
 gradient_clipping: 1.0
 offload_optimizer_device: none
 offload_param_device: none
 zero3_init_flag: true
 zero_stage: 2
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 2
use_cpu: false
```

```bash
accelerate launch examples/nlp_example.py --mixed_precision fp16
```

**ZeRO Stage-3 with CPU Offload DeepSpeed Plugin Example**
```bash
compute_environment: LOCAL_MACHINE
deepspeed_config:
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  offload_optimizer_device: cpu
  offload_param_device: cpu
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 2
use_cpu: false
```

```bash
accelerate launch examples/nlp_example.py --mixed_precision fp16
```

Currently, `Accelerate` supports the following config through the CLI:

```bash
`zero_stage`: [0] Disabled, [1] optimizer state partitioning, [2] optimizer+gradient state partitioning and [3] optimizer+gradient+parameter partitioning
`gradient_accumulation_steps`: Number of training steps to accumulate gradients before averaging and applying them.
`gradient_clipping`: Enable gradient clipping with value.
`offload_optimizer_device`: [none] Disable optimizer offloading, [cpu] offload optimizer to CPU, [nvme] offload optimizer to NVMe SSD. Only applicable with ZeRO >= Stage-2.
`offload_optimizer_nvme_path`: Decides Nvme Path to offload optimizer states. If unspecified, will default to 'none'.
`offload_param_device`: [none] Disable parameter offloading, [cpu] offload parameters to CPU, [nvme] offload parameters to NVMe SSD. Only applicable with ZeRO Stage-3.
`offload_param_nvme_path`: Decides Nvme Path to offload parameters. If unspecified, will default to 'none'.
`zero3_init_flag`: Decides whether to enable `deepspeed.zero.Init` for constructing massive models. Only applicable with ZeRO Stage-3.
`zero3_save_16bit_model`: Decides whether to save 16-bit model weights when using ZeRO Stage-3.
`mixed_precision`: `no` for FP32 training, `fp16` for FP16 mixed-precision training and `bf16` for BF16 mixed-precision training.
`deepspeed_moe_layer_cls_names`: Comma-separated list of transformer Mixture-of-Experts (MoE) layer class names (case-sensitive) to wrap, e.g., `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ...
`deepspeed_hostfile`: DeepSpeed hostfile for configuring multi-node compute resources.
`deepspeed_exclusion_filter`: DeepSpeed exclusion filter string when using multi-node setup.
`deepspeed_inclusion_filter`: DeepSpeed inclusion filter string when using multi-node setup.
`deepspeed_multinode_launcher`: DeepSpeed multi-node launcher to use, e.g. `pdsh`, `standard`, `openmpi`, `mvapich`, `mpich`, `slurm`, `nossh` (requires DeepSpeed >= 0.14.5). If unspecified, will default to `pdsh`.
`deepspeed_config_file`: path to the DeepSpeed config file in `json` format. See the next section for more details on this.
```
To be able to tweak more options, you will need to use a DeepSpeed config file.

### DeepSpeed Config File
On your machine(s) just run:

```bash
accelerate config
```

and answer the questions asked. It will ask whether you want to use a config file for deepspeed to which you answer yes
and provide the path to the deepspeed config file.
This will generate a config file that will be used automatically to properly set the
default options when doing

```bash
accelerate launch my_script.py --args_to_my_script
```

For instance, here is how you would run the NLP example `examples/by_feature/deepspeed_with_config_support.py` (from the root of the repo) with DeepSpeed Config File:

**ZeRO Stage-2 DeepSpeed Config File Example**
```bash
compute_environment: LOCAL_MACHINE
deepspeed_config:
 deepspeed_config_file: /home/ubuntu/accelerate/examples/deepspeed_config_templates/zero_stage2_config.json
 zero3_init_flag: true
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 2
use_cpu: false
```

with the contents of `zero_stage2_config.json` being:
```json
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto",
            "torch_adam": true,
            "adam_w_mode": true
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": "auto",
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
```

```bash
accelerate launch examples/by_feature/deepspeed_with_config_support.py \
--config_name "gpt2-large" \
--tokenizer_name "gpt2-large" \
--dataset_name "wikitext" \
--dataset_config_name "wikitext-2-raw-v1" \
--block_size 128 \
--output_dir "./clm/clm_deepspeed_stage2_accelerate" \
--learning_rate 5e-4 \
--per_device_train_batch_size 24 \
--per_device_eval_batch_size 24 \
--num_train_epochs 3 \
--with_tracking \
--report_to "wandb"
```

**ZeRO Stage-3 with CPU offload DeepSpeed Config File Example**
```bash
compute_environment: LOCAL_MACHINE
deepspeed_config:
 deepspeed_config_file: /home/ubuntu/accelerate/examples/deepspeed_config_templates/zero_stage3_offload_config.json
 zero3_init_flag: true
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 2
use_cpu: false
```
with the contents of `zero_stage3_offload_config.json` being:
```json
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "sub_group_size": 1e9,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": "auto"
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
```

```bash
accelerate launch examples/by_feature/deepspeed_with_config_support.py \
--config_name "gpt2-large" \
--tokenizer_name "gpt2-large" \
--dataset_name "wikitext" \
--dataset_config_name "wikitext-2-raw-v1" \
--block_size 128 \
--output_dir "./clm/clm_deepspeed_stage3_offload_accelerate" \
--learning_rate 5e-4 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--num_train_epochs 3 \
--with_tracking \
--report_to "wandb"
```

**ZeRO++ Config Example**
You can use the features of ZeRO++ by using the appropriate config parameters. Note that ZeRO++ is an extension for ZeRO Stage 3. Here is how the config file can be modified, from [DeepSpeed's ZeRO++ tutorial](https://www.deepspeed.ai/tutorials/zeropp/):

```json
{
    "zero_optimization": {
        "stage": 3,
        "reduce_bucket_size": "auto",

        "zero_quantized_weights": true,
        "zero_hpz_partition_size": 8,
        "zero_quantized_gradients": true,

        "contiguous_gradients": true,
        "overlap_comm": true
    }
}
```

For hierarchical partitioning, the partition size `zero_hpz_partition_size` should ideally be set to the number of GPUs per node. (For example, the above config file assumes 8 GPUs per node)

**Important code changes when using DeepSpeed Config File**

1. DeepSpeed Optimizers and Schedulers. For more information on these,
see the [DeepSpeed Optimizers](https://deepspeed.readthedocs.io/en/latest/optimizers.html) and [DeepSpeed Schedulers](https://deepspeed.readthedocs.io/en/latest/schedulers.html) documentation.
We will look at the changes needed in the code when using these.

   a. DS Optim + DS Scheduler: The case when both `optimizer` and `scheduler` keys are present in the DeepSpeed config file.
   In this situation, those will be used and the user has to use `accelerate.utils.DummyOptim` and `accelerate.utils.DummyScheduler` to replace the PyTorch/Custom optimizers and schedulers in their code.
   Below is the snippet from `examples/by_feature/deepspeed_with_config_support.py` showing this:
   ```python
    # Creates Dummy Optimizer if `optimizer` was specified in the config file else creates Adam Optimizer
    optimizer_cls = (
        torch.optim.AdamW
        if accelerator.state.deepspeed_plugin is None
        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
        else DummyOptim
    )
    optimizer = optimizer_cls(optimizer_grouped_parameters, lr=args.learning_rate)

    # Creates Dummy Scheduler if `scheduler` was specified in the config file else creates `args.lr_scheduler_type` Scheduler
    if (
        accelerator.state.deepspeed_plugin is None
        or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
    ):
        lr_scheduler = get_scheduler(
            name=args.lr_scheduler_type,
            optimizer=optimizer,
            num_warmup_steps=args.num_warmup_steps,
            num_training_steps=args.max_train_steps,
        )
    else:
        lr_scheduler = DummyScheduler(
            optimizer, total_num_steps=args.max_train_steps, warmup_num_steps=args.num_warmup_steps
        )
   ```
   b. Custom Optim + Custom Scheduler: The case when both `optimizer` and `scheduler` keys are absent in the DeepSpeed config file.
   In this situation, no code changes are needed from the user and this is the case when using integration via DeepSpeed Plugin.
   In the above example we can see that the code remains unchanged if the `optimizer` and `scheduler` keys are absent in the DeepSpeed config file.

   c. Custom Optim + DS Scheduler: The case when only `scheduler` key is present in the DeepSpeed config file.
   In this situation, the user has to use `accelerate.utils.DummyScheduler` to replace the PyTorch/Custom scheduler in their code.

   d. DS Optim + Custom Scheduler: The case when only `optimizer` key is present in the DeepSpeed config file.
   This will result in an error because you can only use DS Scheduler when using DS Optim.

2. Notice the `auto` values in the above example DeepSpeed config files. These are automatically handled by `prepare` method
based on model, dataloaders, dummy optimizer and dummy schedulers provided to `prepare` method.
Only the `auto` fields specified in above examples are handled by `prepare` method and the rest have to be explicitly specified by the user.

The `auto` values are calculated as:

- `reduce_bucket_size`: `hidden_size * hidden_size`
- `stage3_prefetch_bucket_size`: `int(0.9 * hidden_size * hidden_size)`
- `stage3_param_persistence_threshold`: `10 * hidden_size`

For the `auto` feature to work for these 3 config entries - Accelerate will use `model.config.hidden_size` or `max(model.config.hidden_sizes)` as `hidden_size`. If neither of these is available, the launching will fail and you will have to set these 3 config entries manually. Remember the first 2 config entries are the communication buffers - the larger they are the more efficient the comms will be, and the larger they are the more GPU memory they will consume, so it's a tunable performance trade-off.


**Things to note when using DeepSpeed Config File**

Below is a sample script using `deepspeed_config_file` in different scenarios.

Code `test.py`:

```python
from accelerate import Accelerator
from accelerate.state import AcceleratorState


def main():
    accelerator = Accelerator()
    accelerator.print(f"{AcceleratorState()}")


if __name__ == "__main__":
    main()
```

**Scenario 1**: Manually tampered accelerate config file having `deepspeed_config_file` along with other entries.

1. Content of the `accelerate` config:

```yaml
command_file: null
commands: null
compute_environment: LOCAL_MACHINE
deepspeed_config:
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  offload_optimizer_device: 'cpu'
  offload_param_device: 'cpu'
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
  deepspeed_config_file: 'ds_config.json'
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_backend: 'NO'
fsdp_config: {}
gpu_ids: null
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
megatron_lm_config: {}
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_name: null
tpu_zone: null
use_cpu: false
```

2. `ds_config.json`:

```json
{
    "bf16": {
        "enabled": true
    },
    "zero_optimization": {
        "stage": 3,
        "stage3_gather_16bit_weights_on_model_save": false,
        "offload_optimizer": {
            "device": "none"
        },
        "offload_param": {
            "device": "none"
        }
    },
    "gradient_clipping": 1.0,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": 10,
    "steps_per_print": 2000000
}
```

3. Output of `accelerate launch test.py`:

```bash
ValueError: When using `deepspeed_config_file`, the following accelerate config variables will be ignored:
['gradient_accumulation_steps', 'gradient_clipping', 'zero_stage', 'offload_optimizer_device', 'offload_param_device',
'zero3_save_16bit_model', 'mixed_precision'].
Please specify them appropriately in the DeepSpeed config file.
If you are using an accelerate config file, remove other config variables mentioned in the above specified list.
The easiest method is to create a new config following the questionnaire via `accelerate config`.
It will only ask for the necessary config variables when using `deepspeed_config_file`.
```

**Scenario 2**: Use the solution suggested by the error to create a new accelerate config and check that no ambiguity error is now thrown.

1. Run `accelerate config`:

```bash
$ accelerate config
-------------------------------------------------------------------------------------------------------------------------------
In which compute environment are you running?
This machine
-------------------------------------------------------------------------------------------------------------------------------
Which type of machine are you using?
multi-GPU
How many different machines will you use (use more than 1 for multi-node training)? [1]:
Do you wish to optimize your script with torch dynamo?[yes/NO]:
Do you want to use DeepSpeed? [yes/NO]: yes
Do you want to specify a json file to a DeepSpeed config? [yes/NO]: yes
Please enter the path to the json DeepSpeed config file: ds_config.json
Do you want to enable `deepspeed.zero.Init` when using ZeRO Stage-3 for constructing massive models? [yes/NO]: yes
How many GPU(s) should be used for distributed training? [1]:4
accelerate configuration saved at ds_config_sample.yaml
```

2. Content of the `accelerate` config:

```yaml
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_config_file: ds_config.json
  zero3_init_flag: true
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_backend: 'NO'
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
use_cpu: false
```

3. Output of `accelerate launch test.py`:

```bash
Distributed environment: DEEPSPEED  Backend: nccl
Num processes: 4
Process index: 0
Local process index: 0
Device: cuda:0
Mixed precision type: bf16
ds_config: {'bf16': {'enabled': True}, 'zero_optimization': {'stage': 3, 'stage3_gather_16bit_weights_on_model_save': False, 'offload_optimizer': {'device': 'none'}, 'offload_param': {'device': 'none'}}, 'gradient_clipping': 1.0, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 10, 'steps_per_print': inf, 'fp16': {'enabled': False}}
```

**Scenario 3**: Set the `accelerate launch` command arguments related to DeepSpeed to `"auto"` in the DeepSpeed configuration file and check that things work as expected.

1. New `ds_config.json` with `"auto"` for the `accelerate launch` DeepSpeed command arguments:

```json
{
    "bf16": {
        "enabled": "auto"
    },
    "zero_optimization": {
        "stage": "auto",
        "stage3_gather_16bit_weights_on_model_save": "auto",
        "offload_optimizer": {
            "device": "auto"
        },
        "offload_param": {
            "device": "auto"
        }
    },
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "steps_per_print": 2000000
}
```

2. Output of `accelerate launch --mixed_precision="fp16" --zero_stage=3 --gradient_accumulation_steps=5 --gradient_clipping=1.0 --offload_param_device="cpu" --offload_optimizer_device="nvme" --zero3_save_16bit_model="true" test.py`:

```bash
Distributed environment: DEEPSPEED  Backend: nccl
Num processes: 4
Process index: 0
Local process index: 0
Device: cuda:0
Mixed precision type: fp16
ds_config: {'bf16': {'enabled': False}, 'zero_optimization': {'stage': 3, 'stage3_gather_16bit_weights_on_model_save': True, 'offload_optimizer': {'device': 'nvme'}, 'offload_param': {'device': 'cpu'}}, 'gradient_clipping': 1.0, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 5, 'steps_per_print': inf, 'fp16': {'enabled': True, 'auto_cast': True}}
```

**Note**:
1. Remaining `"auto"` values are handled in `accelerator.prepare()` call as explained in point 2 of
`Important code changes when using DeepSpeed Config File`.
2. Only when `gradient_accumulation_steps` is `auto`, the value passed while creating `Accelerator` object via `Accelerator(gradient_accumulation_steps=k)` will be used. When using DeepSpeed Plugin, the value from it will be used and it will overwrite the value passed while creating Accelerator object.

## Saving and loading

1. Saving and loading of models is unchanged for ZeRO Stage-1 and Stage-2.

2. Under ZeRO Stage-3, `state_dict` contains just the placeholders since the model weights are partitioned across multiple GPUs.
ZeRO Stage-3 has 2 options:

   a. Saving the entire 16bit model weights to directly load later on using `model.load_state_dict(torch.load("pytorch_model.bin"))`.
   For this, either set `zero_optimization.stage3_gather_16bit_weights_on_model_save` to True in DeepSpeed Config file or set
   `zero3_save_16bit_model` to True in DeepSpeed Plugin.
   **Note that this option requires consolidation of the weights on one GPU, which can be slow and memory demanding, so only use this feature when needed.**
   Below is the snippet from `examples/by_feature/deepspeed_with_config_support.py` showing this:
   ```python
   unwrapped_model = accelerator.unwrap_model(model)

   # New Code #
   # Saves the whole/unpartitioned fp16 model when in ZeRO Stage-3 to the output directory if
   # `stage3_gather_16bit_weights_on_model_save` is True in DeepSpeed Config file or
   # `zero3_save_16bit_model` is True in DeepSpeed Plugin.
   # For Zero Stages 1 and 2, models are saved as usual in the output directory.
   # The model name saved is `pytorch_model.bin`
   unwrapped_model.save_pretrained(
       args.output_dir,
       is_main_process=accelerator.is_main_process,
       save_function=accelerator.save,
       state_dict=accelerator.get_state_dict(model),
   )
   ```

   b. To get 32bit weights, first save the model using `model.save_checkpoint()`.
   Below is the snippet from `examples/by_feature/deepspeed_with_config_support.py` showing this:
   ```python
   success = model.save_checkpoint(PATH, ckpt_id, checkpoint_state_dict)
   status_msg = f"checkpointing: PATH={PATH}, ckpt_id={ckpt_id}"
   if success:
       logging.info(f"Success {status_msg}")
   else:
       logging.warning(f"Failure {status_msg}")
   ```
   This will create ZeRO model and optimizer partitions along with `zero_to_fp32.py` script in checkpoint directory.
   You can use this script to do offline consolidation.
   It requires no configuration files or GPUs. Here is an example of its usage:
   ```bash
   $ cd /path/to/checkpoint_dir
   $ ./zero_to_fp32.py . pytorch_model.bin
   Processing zero checkpoint at global_step1
   Detected checkpoint of type zero stage 3, world_size: 2
   Saving fp32 state dict to pytorch_model.bin (total_numel=60506624)
   ```
   To get 32bit model for saving/inference, you can perform:
   ```python
   from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint

   unwrapped_model = accelerator.unwrap_model(model)
   fp32_model = load_state_dict_from_zero_checkpoint(unwrapped_model, checkpoint_dir)
   ```
   If you are only interested in the `state_dict`, you can do the following:
   ```python
   from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

   state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)
   ```
   Note that all these functions require ~2x memory (general RAM) of the size of the final checkpoint.

## ZeRO Inference
DeepSpeed ZeRO Inference supports ZeRO stage 3 with ZeRO-Infinity.
It uses the same ZeRO protocol as training, but it doesn't use an optimizer or an LR scheduler, and only stage 3 is relevant.
With accelerate integration, you just need to prepare the model and dataloader as shown below:

```python
model, eval_dataloader = accelerator.prepare(model, eval_dataloader)
```

## Few caveats to be aware of

1. Current integration doesn’t support Pipeline Parallelism of DeepSpeed.
2. Current integration doesn’t support `mpu`, limiting the tensor parallelism which is supported in Megatron-LM.
3. Current integration doesn’t support multiple models.

## Multi-node DeepSpeed
DeepSpeed supports multi-node inference and training over a variety of different launchers. You can specify a different launcher by setting the `deepspeed_multinode_launcher` config in the CLI or in the DeepSpeed config file.

Currently, accelerate supports passing configuration for the following DeepSpeed multi-node launchers: `pdsh` (default), `standard`, `openmpi`, `mvapich`, `mpich`, `slurm`, `nossh` (requires DeepSpeed >= 0.14.5).

Please read the [DeepSpeed documentation](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) for more information on the different launchers. By default, DeepSpeed will attempt to use passwordless SSH from the main machine node to the other nodes to perform the launcher command. In this configuration, the accelerate launch command only needs to be run on the main node. If using the `nossh` launcher, you will need to run the accelerate launch command on every node using copied configuration. 

## DeepSpeed Resources

The documentation for the internals related to deepspeed can be found [here](../package_reference/deepspeed).

- [Project's github](https://github.com/deepspeedai/DeepSpeed)
- [Usage docs](https://www.deepspeed.ai/getting-started/)
- [API docs](https://deepspeed.readthedocs.io/en/latest/index.html)
- [Blog posts](https://www.microsoft.com/en-us/research/search/?q=deepspeed)

Papers:

- [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://huggingface.co/papers/1910.02054)
- [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://huggingface.co/papers/2101.06840)
- [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://huggingface.co/papers/2104.07857)
- [ZeRO++: Extremely Efficient Collective Communication for Giant Model Training](https://huggingface.co/papers/2306.10209)


Finally, please remember that `Accelerate` only integrates DeepSpeed. Therefore, if you
have any problems or questions regarding DeepSpeed usage, please file an issue with [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/issues).


<Tip>

    For those interested in the similarities and differences between FSDP and DeepSpeed, please check out the [concept guide here](../concept_guides/fsdp_and_deepspeed)!
    
</Tip>

================================================
FILE: docs/source/usage_guides/deepspeed_multiple_model.md
================================================
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Using multiple models with DeepSpeed

<Tip warning={true}>

    This guide assumes that you have read and understood the [DeepSpeed usage guide](./deepspeed.md).

</Tip>

Running multiple models with Accelerate and DeepSpeed is useful for:

* Knowledge distillation
* Post-training techniques like RLHF (see the [TRL](https://github.com/huggingface/trl) library for more examples)
* Training multiple models at once

Currently, Accelerate has a **very experimental API** to help you use multiple models.

This tutorial will focus on two common use cases:

1. Knowledge distillation, where a smaller student model is trained to mimic a larger, better-performing teacher.  If the student model fits on a single GPU, we can use ZeRO-2 for training and ZeRO-3 to shard the teacher for inference. This is significantly faster than using ZeRO-3 for both models.
2. Training multiple *disjoint* models at once.

## Knowledge distillation

Knowledge distillation is a good example of using multiple models, but only training one of them.

Normally, you would use a single [`utils.DeepSpeedPlugin`] for both models. However, in this case, there are two separate configurations. Accelerate allows you to create and use multiple plugins **if and only if** they are in a `dict` so that you can reference and enable the proper plugin when needed.

```python
from accelerate.utils import DeepSpeedPlugin

zero2_plugin = DeepSpeedPlugin(hf_ds_config="zero2_config.json")
zero3_plugin = DeepSpeedPlugin(hf_ds_config="zero3_config.json")

deepspeed_plugins = {"student": zero2_plugin, "teacher": zero3_plugin}
```

The `zero2_config.json` should be configured for full training (so specify `scheduler` and `optimizer` if you are not utilizing your own), while `zero3_config.json` should only be configured for the inference model, as shown in the example below.

```json
{
    "bf16": {
        "enabled": "auto"
    },
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": true,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": "auto",
        "stage3_max_reuse_distance": "auto"
    },
    "train_micro_batch_size_per_gpu": 1
}
```

An example `zero2_config.json` configuration is shown below.

```json
{
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto",
            "torch_adam": true,
            "adam_w_mode": true
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        }
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto"
}
```

<Tip>

    DeepSpeed will raise an error if `train_micro_batch_size_per_gpu` isn't specified, even if this particular model isn't being trained.

</Tip>

From here, create a single [`Accelerator`] and pass in both configurations.

```python
from accelerate import Accelerator

accelerator = Accelerator(deepspeed_plugins=deepspeed_plugins)
```

Now let's see how to use them.

### Student model

By default, Accelerate sets the first item in the `dict` as the default or enabled plugin (`"student"` plugin). Verify this by using the [`utils.deepspeed.get_active_deepspeed_plugin`] function to see which plugin is enabled.

```python
active_plugin = get_active_deepspeed_plugin(accelerator.state)
assert active_plugin is deepspeed_plugins["student"]
```

[`AcceleratorState`] also keeps the active DeepSpeed plugin saved in `state.deepspeed_plugin`.
```python
assert active_plugin is accelerator.deepspeed_plugin
```

Since `student` is the currently active plugin, let's go ahead and prepare the model, optimizer, and scheduler.

```python
student_model, optimizer, scheduler = ...
student_model, optimizer, scheduler, train_dataloader = accelerator.prepare(student_model, optimizer, scheduler, train_dataloader)
```

Now it's time to deal with the teacher model.

### Teacher model

First, you need to specify in [`Accelerator`] that the `zero3_config.json` configuration should be used.

```python
accelerator.state.select_deepspeed_plugin("teacher")
```

This disables the `"student"` plugin and enables the `"teacher"` plugin instead. The
DeepSpeed stateful config inside of Transformers is updated, and it changes which plugin configuration gets called when using
`deepspeed.initialize()`. This allows you to use the automatic `deepspeed.zero.Init`  context manager integration Transformers provides.

```python
teacher_model = AutoModel.from_pretrained(...)
teacher_model = accelerator.prepare(teacher_model)
```

Otherwise, you should manually initialize the model with `deepspeed.zero.Init`.
```python
with deepspeed.zero.Init(accelerator.deepspeed_plugin.config):
    model = MyModel(...)
```

### Training

From here, your training loop can be whatever you like, as long as `teacher_model` is never trained.

```python
teacher_model.eval()
student_model.train()
for batch in train_dataloader:
    with torch.no_grad():
        output_teacher = teacher_model(**batch)
    output_student = student_model(**batch)
    # Combine the losses or modify it in some way
    loss = output_teacher.loss + output_student.loss
    accelerator.backward(loss)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
```

## Train multiple disjoint models

Training multiple models is a more complicated scenario.
In its current state, we assume each model is **completely disjoint** from the other during training.

This scenario still requires two [`utils.DeepSpeedPlugin`]'s to be made. However, you also need a second [`Accelerator`], since different `deepspeed` engines are being called at different times. A single [`Accelerator`] can only carry one instance at a time.

Since the [`state.AcceleratorState`] is a stateful object though, it is already aware of both [`utils.DeepSpeedPlugin`]'s available. You can just instantiate a second [`Accelerator`] with no extra arguments.

```python
first_accelerator = Accelerator(deepspeed_plugins=deepspeed_plugins)
second_accelerator = Accelerator()
```

You can call either `first_accelerator.state.select_deepspeed_plugin()` or `second_accelerator.state.select_deepspeed_plugin()` to enable or disable
a particular plugin, and then call [`prepare`].

```python
# can be `first_accelerator`, `second_accelerator`, or by calling `AcceleratorState().select_deepspeed_plugin(...)`
first_accelerator.state.select_deepspeed_plugin("first_model")
first_model = AutoModel.from_pretrained(...)
# For this example, `get_training_items` is a nonexistent function that gets the setup we need for training
first_optimizer, first_scheduler, train_dl, eval_dl = get_training_items(first_model)
first_model, first_optimizer, first_scheduler, train_dl, eval_dl = first_accelerator.prepare(
    first_model, first_optimizer, first_scheduler, train_dl, eval_dl
)

second_accelerator.state.select_deepspeed_plugin("second_model")
second_model = AutoModel.from_pretrained(...)
# For this example, `get_training_items` is a nonexistent function that gets the setup we need for training
second_optimizer, second_scheduler, _, _ = get_training_items(second_model)
second_model, second_optimizer, second_scheduler = second_accelerator.prepare(
    second_model, second_optimizer, second_scheduler
)
```

And now you can train:

```python
for batch in train_dl:
    outputs1 = first_model(**batch)
    first_accelerator.backward(outputs1.loss)
    first_optimizer.step()
    first_scheduler.step()
    first_optimizer.zero_grad()
    
    outputs2 = second_model(**batch)
    second_accelerator.backward(outputs2.loss)
    second_optimizer.step()
    second_scheduler.step()
    second_optimizer.zero_grad()
```

## Resources

To see more examples, please check out the [related tests](https://github.com/huggingface/accelerate/blob/main/src/accelerate/test_utils/scripts/external_deps/test_ds_multiple_model.py) currently in Accelerate.


================================================
FILE: docs/source/usage_guides/distributed_inference.md
================================================
<!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Distributed inference

Distributed inference can fall into three brackets:

1. Loading an entire model onto each GPU and sending chunks of a batch through each GPU's model copy at a time
2. Loading parts of a model onto each GPU and processing a single input at one time
3. Loading parts of a model onto each GPU and using what is called scheduled Pipeline Parallelism to combine the two prior techniques. 

We're going to go through the first and the last bracket, showcasing how to do each as they are more realistic scenarios.


## Sending chunks of a batch automatically to each loaded model

This is the most memory-intensive solution, as it requires each GPU to keep a full copy of the model in memory at a given time. 

Normally when doing this, users send the model to a specific device to load it from the CPU, and then move each prompt to a different device. 

A basic pipeline using the `diffusers` library might look something like so:

```python
import torch
import torch.distributed as dist
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
```
Followed then by performing inference based on the specific prompt:

```python
def run_inference(rank, world_size):
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    pipe.to(rank)

    if torch.distributed.get_rank() == 0:
        prompt = "a dog"
    elif torch.distributed.get_rank() == 1:
        prompt = "a cat"

    result = pipe(prompt).images[0]
    result.save(f"result_{rank}.png")
```
One will notice how we have to check the rank to know what prompt to send, which can be a bit tedious.

A user might then also think that with Accelerate, using the `Accelerator` to prepare a dataloader for such a task might also be 
a simple way to manage this. (To learn more, check out the relevant section in the [Quick Tour](../quicktour#distributed-evaluation))

Can it manage it? Yes. Does it add unneeded extra code however: also yes.


With Accelerate, we can simplify this process by using the [`Accelerator.split_between_processes`] context manager (which also exists in `PartialState` and `AcceleratorState`). 
This function will automatically split whatever data you pass to it (be it a prompt, a set of tensors, a dictionary of the prior data, etc.) across all the processes (with a potential
to be padded) for you to use right away.

Let's rewrite the above example using this context manager:

```python
import torch
from accelerate import PartialState  # Can also be Accelerator or AcceleratorState
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
distributed_state = PartialState()
pipe.to(distributed_state.device)

# Assume two processes
with distributed_state.split_between_processes(["a dog", "a cat"]) as prompt:
    result = pipe(prompt).images[0]
    result.save(f"result_{distributed_state.process_index}.png")
```

And then to launch the code, we can use Accelerate:

If you have generated a config file to be used using `accelerate config`:

```bash
accelerate launch distributed_inference.py
```

If you have a specific config file you want to use:

```bash
accelerate launch --config_file my_config.json distributed_inference.py
```

Or if you don't want to make any config files and launch on two GPUs:

> Note: You will get some warnings about values being guessed based on your system. To remove these you can do `accelerate config default` or go through `accelerate config` to create a config file.

```bash
accelerate launch --num_processes 2 distributed_inference.py
```

We've now reduced the boilerplate code needed to split this data to a few lines of code quite easily.

But what if we have an odd distribution of prompts to GPUs? For example, what if we have 3 prompts, but only 2 GPUs? 

Under the context manager, the first GPU would receive the first two prompts and the second GPU the third, ensuring that 
all prompts are split and no overhead is needed.

*However*, what if we then wanted to do something with the results of *all the GPUs*? (Say gather them all and perform some kind of post processing)
You can pass in `apply_padding=True` to ensure that the lists of prompts are padded to the same length, with extra data being taken 
from the last sample. This way all GPUs will have the same number of prompts, and you can then gather the results.

<Tip>

This is only needed when trying to perform an action such as gathering the results, where the data on each device 
needs to be the same length. Basic inference does not require this.

</Tip>

For instance:

```python
import torch
from accelerate import PartialState  # Can also be Accelerator or AcceleratorState
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
distributed_state = PartialState()
pipe.to(distributed_state.device)

# Assume two processes
with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"], apply_padding=True) as prompt:
    result = pipe(prompt).images
```

On the first GPU, the prompts will be `["a dog", "a cat"]`, and on the second GPU it will be `["a chicken", "a chicken"]`.
Make sure to drop the final sample, as it will be a duplicate of the previous one.

You can find more complex examples [here](https://github.com/huggingface/accelerate/tree/main/examples/inference/distributed) such as how to use it with LLMs.

## Memory-efficient pipeline parallelism (experimental)

This next part will discuss using *pipeline parallelism*. This is an **experimental** API that utilizes [torch.distributed.pipelining](https://pytorch.org/docs/stable/distributed.pipelining.html#) as a native solution. 

The general idea with pipeline parallelism is: say you have 4 GPUs and a model big enough it can be *split* on four GPUs using `device_map="auto"`. With this method you can send in 4 inputs at a time (for example here, any amount works) and each model chunk will work on an input, then receive the next input once the prior chunk finished, making it *much* more efficient **and faster** than the method described earlier. Here's a visual taken from the PyTorch repository:

![Pipeline parallelism example](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/accelerate/pipeline_parallel.png)

To illustrate how you can use this with Accelerate, we have created an [example zoo](https://github.com/huggingface/accelerate/tree/main/examples/inference) showcasing a number of different models and situations. In this tutorial, we'll show this method for GPT2 across two GPUs.

Before you proceed, please make sure you have the latest PyTorch version installed by running the following:

```bash
pip install torch
```

Start by creating the model on the CPU:

```{python}
from transformers import GPT2ForSequenceClassification, GPT2Config

config = GPT2Config()
model = GPT2ForSequenceClassification(config)
model.eval()
```

Next you'll need to create some example inputs to use. These help `torch.distributed.pipelining` trace the model.

<Tip warning={true}>
    However you make this example will determine the relative batch size that will be used/passed
    through the model at a given time, so make sure to remember how many items there are!
</Tip>

```{python}
input = torch.randint(
    low=0,
    high=config.vocab_size,
    size=(2, 1024),  # bs x seq_len
    device="cpu",
    dtype=torch.int64,
    requires_grad=False,
)
```
Next we need to actually perform the tracing and get the model ready. To do so, use the [`inference.prepare_pippy`] function and it will fully wrap the model for pipeline parallelism automatically:

```{python}
from accelerate.inference import prepare_pippy
example_inputs = {"input_ids": input}
model = prepare_pippy(model, example_args=(input,))
```

<Tip>

    There are a variety of parameters you can pass through to `prepare_pippy`:
    
    * `split_points` lets you determine what layers to split the model at. By default we use wherever `device_map="auto"` declares, such as `fc` or `conv1`.

    * `num_chunks` determines how the batch will be split and sent to the model itself (so `num_chunks=1` with four split points/four GPUs will have a naive MP where a single input gets passed between the four layer split points)

</Tip>

From here, all that's left is to actually perform the distributed inference!

<Tip warning={true}>

When passing inputs, we highly recommend providing them as a tuple of arguments. Using `kwargs` is supported, however, this approach is experimental.
</Tip>

```{python}
args = some_more_arguments
with torch.no_grad():
    output = model(*args)
```

When finished, all the data will be on the last process only:

```{python}
from accelerate import PartialState
if PartialState().is_last_process:
    print(output)
```

<Tip>

    If you pass in `gather_output=True` to [`inference.prepare_pippy`], the output will be sent
    across to all the GPUs afterwards without needing the `is_last_process` check. This is 
    `False` by default as it incurs a communication call.
    
</Tip>

And that's it! To explore more, please check out the inference examples in the [Accelerate repo](https://github.com/huggingface/accelerate/tree/main/examples/inference/pippy) and our [documentation](../package_reference/inference) as we work on improving this integration.


================================================
FILE: docs/source/usage_guides/explore.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Start Here!

Please use the interactive tool below to help you get started with learning about a particular 
feature of Accelerate and how to utilize it! It will provide you with a code diff, an explanation
of what is going on, and some useful links to explore more within
the documentation!

Most code examples start from the following python code before integrating Accelerate in some way:

```python
for batch in dataloader:
    optimizer.zero_grad()
    inputs, targets = batch
    inputs = inputs.to(device)
    targets = targets.to(device)
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    loss.backward()
    optimizer.step()
    scheduler.step()
```

<div class="block dark:hidden">
	<iframe 
        src="https://hf-accelerate-accelerate-examples.hf.space?__theme=light"
        width="850"
        height="1600"
    ></iframe>
</div>
<div class="hidden dark:block">
    <iframe 
        src="https://hf-accelerate-accelerate-examples.hf.space?__theme=dark"
        width="850"
        height="1600"
    ></iframe>
</div>


================================================
FILE: docs/source/usage_guides/fsdp.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Fully Sharded Data Parallel

To accelerate training huge models on larger batch sizes, we can use a fully sharded data parallel model.
This type of data parallel paradigm enables fitting more data and larger models by sharding the optimizer states, gradients and parameters.
To read more about it and the benefits, check out the [Fully Sharded Data Parallel blog](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/).
We have integrated the latest PyTorch's Fully Sharded Data Parallel (FSDP) training feature.
All you need to do is enable it through the config.

## How it works out of the box

On your machine(s) just run:

```bash
accelerate config
```

and answer the questions asked. This will generate a config file that will be used automatically to properly set the
default options when doing

```bash
accelerate launch my_script.py --args_to_my_script
```

For instance, here is how you would run `examples/nlp_example.py` (from the root of the repo) with FSDP enabled:

```yaml
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch_policy: BACKWARD_PRE
  fsdp_forward_prefetch: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_offload_params: false
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_transformer_layer_cls_to_wrap: BertLayer
  fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
```

```bash
accelerate launch examples/nlp_example.py
```

Currently, `Accelerate` supports the following config through the CLI:

`fsdp_sharding_strategy`: [1] FULL_SHARD (shards optimizer states, gradients and parameters), [2] SHARD_GRAD_OP (shards optimizer states and gradients), [3] NO_SHARD (DDP), [4] HYBRID_SHARD (shards optimizer states, gradients and parameters within each node while each node has full copy), [5] HYBRID_SHARD_ZERO2 (shards optimizer states and gradients within each node while each node has full copy). For more information, please refer to the official [PyTorch docs](https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.ShardingStrategy).

`fsdp_offload_params`: Decides whether to offload parameters and gradients to CPU

`fsdp_auto_wrap_policy`: [1] TRANSFORMER_BASED_WRAP, [2] SIZE_BASED_WRAP, [3] NO_WRAP

`fsdp_transformer_layer_cls_to_wrap`: Only applicable for Transformers. When using `fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP`, a user may provide a comma-separated string of transformer layer class names (case-sensitive) to wrap, e.g., `BertLayer`, `GPTJBlock`, `T5Block`, `BertLayer,BertEmbeddings,BertSelfOutput`. This is important because submodules that share weights (e.g., embedding layers) should not end up in different FSDP wrapped units. Using this policy, wrapping happens for each block containing Multi-Head Attention followed by a couple of MLP layers. Remaining layers including the shared embeddings are conveniently wrapped in same outermost FSDP unit. Therefore, use this for transformer-based models. You can use the `model._no_split_modules` for Transformer models by answering `yes` to the question "Do you want to use the model's `_no_split_modules` to wrap?". It will try to use `model._no_split_modules` when possible.

`fsdp_min_num_params`: minimum number of parameters when using `fsdp_auto_wrap_policy=SIZE_BASED_WRAP`.

`fsdp_backward_prefetch_policy`: [1] BACKWARD_PRE, [2] BACKWARD_POST, [3] NO_PREFETCH

`fsdp_forward_prefetch`: if True, then FSDP explicitly prefetches the next upcoming all-gather while executing in the forward pass. Should only be used for static-graph models since the prefetching follows the first iteration’s execution order. i.e., if the sub-modules' order changes dynamically during the model's execution do not enable this feature.

`fsdp_state_dict_type`: [1] FULL_STATE_DICT, [2] LOCAL_STATE_DICT, [3] SHARDED_STATE_DICT

`fsdp_use_orig_params`: If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable parameters. This setting is useful in cases such as parameter-efficient fine-tuning as discussed in [this post](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019). This option also allows one to have multiple optimizer param groups. This should be `True` when creating an optimizer before preparing/wrapping the model with FSDP.

`fsdp_cpu_ram_efficient_loading`: Only applicable for Transformers models. If True, only the first process loads the pretrained model checkpoint while all other processes have empty weights. This should be set to False if you experience errors when loading the pretrained Transformers model via `from_pretrained` method. When this setting is True, `fsdp_sync_module_states` must also be True, otherwise all the processes except the main process would have random weights leading to unexpected behaviour during training. For this to work, make sure the distributed process group is initialized before calling Transformers `from_pretrained` method. When using Trainer API, the distributed process group is initialized when you create an instance of `TrainingArguments` class.

`fsdp_sync_module_states`: If True, each individually wrapped FSDP unit will broadcast module parameters from rank 0.


For additional and more nuanced control, you can specify other FSDP parameters via `FullyShardedDataParallelPlugin`.
When creating a `FullyShardedDataParallelPlugin` object, pass it the parameters that weren't part of the accelerate config, or the ones you want to override.
The FSDP parameters are first picked up from the accelerate config file or the launch command arguments; any parameters that you pass directly through the `FullyShardedDataParallelPlugin` object will then set or override them.

Below is an example:

```py
from accelerate import FullyShardedDataParallelPlugin
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=False, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
```

## Saving and loading

The new recommended way of checkpointing when using FSDP models is to use `SHARDED_STATE_DICT` as `StateDictType` when setting up the accelerate config.
Below is the code snippet to save using `save_state` utility of accelerate.

```py
accelerator.save_state("ckpt")
```

Inspect the checkpoint folder to see model and optimizer as shards per process:
```
ls ckpt
# optimizer_0  pytorch_model_0  random_states_0.pkl  random_states_1.pkl  scheduler.bin

cd ckpt

ls optimizer_0
# __0_0.distcp  __1_0.distcp

ls pytorch_model_0
# __0_0.distcp  __1_0.distcp
```

To load them back for resuming the training, use the `load_state` utility of accelerate

```py
accelerator.load_state("ckpt")
```

When using transformers `save_pretrained`, pass `state_dict=accelerator.get_state_dict(model)` to save the model state dict.
  Below is an example:

```diff
  unwrapped_model.save_pretrained(
      args.output_dir,
      is_main_process=accelerator.is_main_process,
      save_function=accelerator.save,
+     state_dict=accelerator.get_state_dict(model),
)
```

### State Dict

`accelerator.get_state_dict` will call the underlying `model.state_dict` implementation using `FullStateDictConfig(offload_to_cpu=True, rank0_only=True)` context manager to get the state dict only for rank 0 and it will be offloaded to CPU.

You can then pass the resulting state dict into the `save_pretrained` method through its `state_dict` argument.  There are several modes for `StateDictType` and `FullStateDictConfig` that you can use to control the behavior of `state_dict`.  For more information, see the [PyTorch documentation](https://pytorch.org/docs/stable/fsdp.html).

If you choose to use `StateDictType.SHARDED_STATE_DICT`, the weights of the model during `Accelerator.save_state` will be split into `n` files for each sub-split on the model. To merge them back into
a single dictionary to load back into the model later after training you can use the `merge_fsdp_weights` utility:

```py
from accelerate.utils import merge_fsdp_weights

# Our weights are saved usually in a `pytorch_model_fsdp_{model_number}` folder
merge_fsdp_weights("pytorch_model_fsdp_0", "output_path", safe_serialization=True)
```
The final output will then either be saved to `model.safetensors` or `pytorch_model.bin` (if `safe_serialization=False` is passed). 

This can also be called using the CLI:
```bash
accelerate merge-weights pytorch_model_fsdp_0/ output_path
```


## Mapping between FSDP sharding strategies and DeepSpeed ZeRO Stages
* `FULL_SHARD` maps to the DeepSpeed `ZeRO Stage-3`. Shards optimizer states, gradients and parameters.
* `SHARD_GRAD_OP` maps to the DeepSpeed `ZeRO Stage-2`. Shards optimizer states and gradients.
* `NO_SHARD` maps to `ZeRO Stage-0`. No sharding wherein each GPU has full copy of model, optimizer states and gradients.
* `HYBRID_SHARD` maps to `ZeRO++ Stage-3` wherein `zero_hpz_partition_size=<num_gpus_per_node>`. Here, this will shard optimizer states, gradients and parameters within each node while each node has full copy.

## A few caveats to be aware of

- In case of multiple models, pass the optimizers to the prepare call in the same order as corresponding models else `accelerator.save_state()` and `accelerator.load_state()` will result in wrong/unexpected behaviour.
- This feature is incompatible with `--predict_with_generate` in the `run_translation.py` script of `Transformers` library.

For more control, users can leverage the `FullyShardedDataParallelPlugin`. After creating an instance of this class, users can pass it to the Accelerator class instantiation.
For more information on these options, please refer to the PyTorch [FullyShardedDataParallel](https://github.com/pytorch/pytorch/blob/0df2e863fbd5993a7b9e652910792bd21a516ff3/torch/distributed/fsdp/fully_sharded_data_parallel.py#L236) code.


<Tip>

    For those interested in the similarities and differences between FSDP and DeepSpeed, please check out the [concept guide here](../concept_guides/fsdp_and_deepspeed)!
    
</Tip>

================================================
FILE: docs/source/usage_guides/gaudi.md
================================================
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Intel Gaudi

Users can take advantage of Intel Gaudi AI accelerators for significantly faster and cost-effective model training and inference.
The Intel Gaudi AI accelerator family currently includes three product generations: [Intel Gaudi 1](https://habana.ai/products/gaudi/), [Intel Gaudi 2](https://habana.ai/products/gaudi2/), and [Intel Gaudi 3](https://habana.ai/products/gaudi3/). Each server is equipped with 8 devices, known as Habana Processing Units (HPUs), providing 128GB of memory on Gaudi 3, 96GB on Gaudi 2, and 32GB on the first-gen Gaudi. For more details on the underlying hardware architecture, check out the [Gaudi Architecture Overview](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html).

## How it works out of the box

It is enabled by default if an Intel Gaudi device is detected.
To disable it, pass `--cpu` flag to `accelerate launch` command or answer the corresponding question when answering the `accelerate config` questionnaire.

You can directly run the following script to test it out on Intel Gaudi:

```bash
accelerate launch /examples/cv_example.py --data_dir images
```

## Limitations

The following features are not part of the Accelerate library and require [Optimum for Intel Gaudi](https://huggingface.co/docs/optimum/main/en/habana/index):

- `fast_ddp` which implements DDP by applying an all-reduce on gradients instead of the Torch DDP wrapper.
- `minimize_memory` which is used for fp8 training and enables keeping fp8 weights in memory between the forward and backward passes, leading to a smaller memory footprint at the cost of additional fp8 casts.
- `context_parallel_size` which is used for Context/Sequence Parallelism (CP/SP) and partitions the network inputs and activations along sequence dimension to reduce memory footprint and increase throughput.


================================================
FILE: docs/source/usage_guides/gradient_accumulation.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Performing gradient accumulation with Accelerate

Gradient accumulation is a technique where you can train on bigger batch sizes than 
your machine would normally be able to fit into memory. This is done by accumulating gradients over
several batches, and only stepping the optimizer after a certain number of batches have been performed.

While technically standard gradient accumulation code would work fine in a distributed setup, it is not the most efficient
method for doing so and you may experience considerable slowdowns!

In this tutorial you will see how to quickly set up gradient accumulation and perform it with the utilities provided in Accelerate,
which can total to adding just one new line of code!

This example will use a very simplistic PyTorch training loop that performs gradient accumulation every two batches:

```python
device = "cuda"
model.to(device)

gradient_accumulation_steps = 2

for index, batch in enumerate(training_dataloader):
    inputs, targets = batch
    inputs = inputs.to(device)
    targets = targets.to(device)
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    loss = loss / gradient_accumulation_steps
    loss.backward()
    if (index + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
```

## Converting it to Accelerate

First the code shown earlier will be converted to utilize Accelerate without the special gradient accumulation helper:

```diff
+ from accelerate import Accelerator
+ accelerator = Accelerator()

+ model, optimizer, training_dataloader, scheduler = accelerator.prepare(
+     model, optimizer, training_dataloader, scheduler
+ )

  for index, batch in enumerate(training_dataloader):
      inputs, targets = batch
-     inputs = inputs.to(device)
-     targets = targets.to(device)
      outputs = model(inputs)
      loss = loss_function(outputs, targets)
      loss = loss / gradient_accumulation_steps
+     accelerator.backward(loss)
      if (index+1) % gradient_accumulation_steps == 0:
          optimizer.step()
          scheduler.step()
          optimizer.zero_grad()
```

<Tip warning={true}>

  In its current state, this code is not going to perform gradient accumulation efficiently due to a process called gradient synchronization. Read more about that in the [Concepts tutorial](../concept_guides/gradient_synchronization)!

</Tip>

## Letting Accelerate handle gradient accumulation

All that is left now is to let Accelerate handle the gradient accumulation for us. To do so you should pass in a `gradient_accumulation_steps` parameter to [`Accelerator`], dictating the number 
of steps to perform before each call to `step()` and how to automatically adjust the loss during the call to [`~Accelerator.backward`]:

```diff
  from accelerate import Accelerator
- accelerator = Accelerator()
+ accelerator = Accelerator(gradient_accumulation_steps=2)
```

Alternatively, you can pass in a `gradient_accumulation_plugin` parameter to the [`Accelerator`] object's `__init__`, which will allow you to further customize the gradient accumulation behavior. 
Read more about that in the [GradientAccumulationPlugin](../package_reference/accelerator#accelerate.utils.GradientAccumulationPlugin) docs.

From here you can use the [`~Accelerator.accumulate`] context manager from inside your training loop to automatically perform the gradient accumulation for you!
You just wrap it around the entire training part of our code: 

```diff
- for index, batch in enumerate(training_dataloader):
+ for batch in training_dataloader:
+     with accelerator.accumulate(model):
          inputs, targets = batch
          outputs = model(inputs)
```

You can remove all the special checks for the step number and the loss adjustment:

```diff
- loss = loss / gradient_accumulation_steps
  accelerator.backward(loss)
- if (index+1) % gradient_accumulation_steps == 0:
  optimizer.step()
  scheduler.step()
  optimizer.zero_grad()
```

As you can see the [`Accelerator`] is able to keep track of the batch number you are on and it will automatically know whether to step through the prepared optimizer and how to adjust the loss. 

<Tip>

Typically with gradient accumulation, you would need to adjust the number of steps to reflect the change in total batches you are 
training on. Accelerate automagically does this for you by default. Behind the scenes we instantiate a [`GradientAccumulationPlugin`] configured to do this.

</Tip>

<Tip warning={true}>

The [`state.GradientState`] is sync'd with the active dataloader being iterated upon. As such it assumes naively that when we have reached the end of the dataloader everything will sync and a step will be performed. To disable this, set `sync_with_dataloader` to be `False` in the [`GradientAccumulationPlugin`]:

```{python}
from accelerate import Accelerator
from accelerate.utils import GradientAccumulationPlugin

plugin = GradientAccumulationPlugin(sync_with_dataloader=False)
accelerator = Accelerator(..., gradient_accumulation_plugin=plugin)
```

</Tip>

## The finished code

Below is the finished implementation for performing gradient accumulation with Accelerate

```python
from accelerate import Accelerator
accelerator = Accelerator(gradient_accumulation_steps=2)
model, optimizer, training_dataloader, scheduler = accelerator.prepare(
    model, optimizer, training_dataloader, scheduler
)
for batch in training_dataloader:
    with accelerator.accumulate(model):
        inputs, targets = batch
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
```

<Tip warning={true}>

It's important that **only one forward/backward** should be done inside the context manager `with accelerator.accumulate(model)`.

</Tip>


To learn more about what magic this wraps around, read the [Gradient Synchronization concept guide](../concept_guides/gradient_synchronization)


## Self-contained example

Here is a self-contained example that you can run to see gradient accumulation in action with Accelerate:

```python
import torch
import copy
from accelerate import Accelerator
from accelerate.utils import set_seed
from torch.utils.data import TensorDataset, DataLoader

# seed
set_seed(0)

# define toy inputs and labels
x = torch.tensor([1., 2., 3., 4., 5., 6., 7., 8.])
y = torch.tensor([2., 4., 6., 8., 10., 12., 14., 16.])
gradient_accumulation_steps = 4
per_device_batch_size = len(x) // gradient_accumulation_steps

# define dataset and dataloader
dataset = TensorDataset(x, y)
dataloader = DataLoader(dataset, batch_size=per_device_batch_size)

# define model, optimizer and loss function
class SimpleLinearModel(torch.nn.Module):
    def __init__(self):
        super(SimpleLinearModel, self).__init__()
        self.weight = torch.nn.Parameter(torch.zeros((1, 1)))

    def forward(self, inputs):
        return inputs @ self.weight

model = SimpleLinearModel()
model_clone = copy.deepcopy(model)
criterion = torch.nn.MSELoss()
model_optimizer = torch.optim.SGD(model.parameters(), lr=0.02)
accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)
model, model_optimizer, dataloader = accelerator.prepare(model, model_optimizer, dataloader)
model_clone_optimizer = torch.optim.SGD(model_clone.parameters(), lr=0.02)
print(f"initial model weight is {model.weight.mean().item():.5f}")
print(f"initial model weight is {model_clone.weight.mean().item():.5f}")
for i, (inputs, labels) in enumerate(dataloader):
    with accelerator.accumulate(model):
        inputs = inputs.view(-1, 1)
        print(i, inputs.flatten())
        labels = labels.view(-1, 1)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        accelerator.backward(loss)
        model_optimizer.step()
        model_optimizer.zero_grad()
loss = criterion(x.view(-1, 1) @ model_clone.weight, y.view(-1, 1))
model_clone_optimizer.zero_grad()
loss.backward()
model_clone_optimizer.step()
print(f"w/ accumulation, the final model weight is {model.weight.mean().item():.5f}")
print(f"w/o accumulation, the final model weight is {model_clone.weight.mean().item():.5f}")
```
```
initial model weight is 0.00000
initial model weight is 0.00000
0 tensor([1., 2.])
1 tensor([3., 4.])
2 tensor([5., 6.])
3 tensor([7., 8.])
w/ accumulation, the final model weight is 2.04000
w/o accumulation, the final model weight is 2.04000
```

## Gradient accumulation on training samples of variable size

As was pointed out in this [blog post](https://huggingface.co/blog/gradient_accumulation), there is a common error that occurs when performing gradient accumulation on training samples of variable size:

>  [...] for gradient accumulation across token-level tasks like causal LM training, the correct loss should be computed by the **total loss across all batches in a gradient accumulation step** divided by the **total number of all non padding tokens in those batches**. This is not the same as the average of the per-batch loss values. 

In other words, some adjustments must be made on losses that operate on a token-level basis.

### Skeleton code

```python
from accelerate import Accelerator
import math
import contextlib

gradient_accumulation_steps = 2
accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)
model, optimizer, training_dataloader, scheduler = accelerator.prepare(
    model, optimizer, training_dataloader, scheduler
)

training_iterator = iter(training_dataloader)
num_samples_in_epoch = len(training_dataloader)
remainder = num_samples_in_epoch % gradient_accumulation_steps
remainder = remainder if remainder != 0 else gradient_accumulation_steps
total_updates = math.ceil(num_samples_in_epoch / gradient_accumulation_steps)
        

total_batched_samples = 0
for update_step in range(total_updates):
        # In order to correctly compute the total number of non-padded tokens on which we'll compute the cross-entropy loss
        # we need to pre-load the full local batch - i.e the next per_device_batch_size * accumulation_steps samples
        batch_samples = []
        num_batches_in_step = gradient_accumulation_steps if update_step != (total_updates - 1) else remainder
        for _ in range(num_batches_in_step):
            batch_samples += [next(training_iterator)]
            
        # get local num items in batch 
        num_items_in_batch = sum([(batch["labels"].ne(-100)).sum() for batch in batch_samples])
        # to compute it correctly in a multi-device DDP training, we need to gather the total number of items in the full batch.
        num_items_in_batch = accelerator.gather(num_items_in_batch).sum().item()
            
        for i, batch in enumerate(batch_samples):
            # if we perform gradient accumulation in a multi-devices set-up, we want to avoid unnecessary communications when accumulating
            # cf: https://muellerzr.github.io/blog/gradient_accumulation.html
            if (i < len(batch_samples) - 1 and accelerator.num_processes > 1):
                ctx = model.no_sync
            else:
                ctx = contextlib.nullcontext
            
            total_batched_samples += 1

            with ctx():
                inputs, targets = batch
                outputs = model(inputs)
                loss = loss_function(outputs, targets) # the loss function should sum over samples rather than averaging
                
                # We multiply by num_processes because DDP calculates the average gradient across all devices whereas dividing by num_items_in_batch already takes into account all devices
                # Same reason for gradient_accumulation_steps, but this time it's Accelerate that calculates the average gradient across the accumulated steps
                loss = (loss * gradient_accumulation_steps * accelerator.num_processes) / num_items_in_batch
                
                accelerator.backward(loss)

        # Sync gradients and perform optimization steps once every gradient_accumulation_steps
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
```

### Self-contained causal LM example

```py
import torch
import copy
from accelerate import Accelerator
from accelerate.utils import set_seed
from accelerate.logging import  get_logger
from torch.utils.data import Dataset, DataLoader
import math
import contextlib

# seed
set_seed(0)
logger = get_logger(__name__)

class MyDataset(Dataset):
    def __init__(self, num_samples):
        super().__init__()
        self.len = num_samples

    def __getitem__(self, index):
        input_ids = torch.arange(1, index+2, dtype=torch.float32)
        labels = torch.remainder(input_ids, 2)
        return {"input_ids": input_ids, "labels": labels}

    def __len__(self):
        return self.len
    
def collate_fn(features):
    input_ids = torch.nn.utils.rnn.pad_sequence([f["input_ids"] for f in features], batch_first=True, padding_value=-100)
    labels = torch.nn.utils.rnn.pad_sequence([f["labels"] for f in features], batch_first=True, padding_value=-100)
    return {"input_ids": input_ids[..., None], "labels": labels[..., None]}

# define toy inputs and labels
gradient_accumulation_steps = 2
per_device_batch_size = 4

# define accelerator
accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)

# define dataset and dataloader
# for this toy example, we'll compute gradient descent over one single global batch
dataset = MyDataset(per_device_batch_size*gradient_accumulation_steps*accelerator.num_processes)
dataloader = DataLoader(dataset, batch_size=per_device_batch_size, collate_fn=collate_fn)

# define model, model_optimizer and loss function
model = torch.nn.Linear(1, 2, bias=False)
model_clone = copy.deepcopy(model)
criterion = torch.nn.CrossEntropyLoss(reduction="sum") # must sum over samples rather than averaging
model_optimizer = torch.optim.SGD(model.parameters(), lr=0.08)


logger.warning(f"initial model weight is {model.weight.detach().cpu().squeeze()}")
logger.warning(f"initial model clone weight is {model_clone.weight.detach().cpu().squeeze()}")

# prepare artifacts - accelerator handles device placement and dataloader splitting
model, model_optimizer = accelerator.prepare(model, model_optimizer)
dataloader = accelerator.prepare_data_loader(dataloader, device_placement=True)
training_iterator = iter(dataloader)

num_samples_in_epoch = len(dataloader)
remainder = num_samples_in_epoch % gradient_accumulation_steps
remainder = remainder if remainder != 0 else gradient_accumulation_steps
total_gradient_updates = math.ceil(num_samples_in_epoch / gradient_accumulation_steps)

total_batched_samples = 0
for update_step in range(total_gradient_updates):
        # In order to correctly compute the total number of non-padded tokens on which we'll compute the cross-entropy loss,
        # we need to pre-load the full local batch - i.e. the next per_device_batch_size * accumulation_steps samples
        batch_samples = []
        num_batches_in_step = gradient_accumulation_steps if update_step != (total_gradient_updates - 1) else remainder
        for _ in range(num_batches_in_step):
            batch_samples += [next(training_iterator)]
            
        # get local num items in batch 
        local_num_items_in_batch = sum([(batch["labels"].ne(-100)).sum() for batch in batch_samples])
        logger.warning(f"Step {update_step} - Device {accelerator.process_index} - num items in the local batch {local_num_items_in_batch}", main_process_only=False)

        # to compute it correctly in a multi-device DDP training, we need to gather the total number of items in the full batch.
        num_items_in_batch = accelerator.gather(local_num_items_in_batch).sum().item()
        logger.warning(f"Total num items {num_items_in_batch}")

        for i, batch in enumerate(batch_samples):
            inputs, labels = batch["input_ids"], batch["labels"]
            total_batched_samples += 1
            # if we perform gradient accumulation in a multi-devices set-up, we want to avoid unnecessary communications when accumulating
            # cf: https://muellerzr.github.io/blog/gradient_accumulation.html
            if (i < len(batch_samples) - 1 and accelerator.num_processes > 1):
                ctx = model.no_sync
            else:
                ctx = contextlib.nullcontext
            with ctx():

                outputs = model(inputs)
                loss = criterion(outputs.view(-1, 2), labels.view(-1).to(torch.int64))
                
                # We multiply by num_processes because the DDP calculates the average gradient across all devices whereas dividing by num_items_in_batch already takes into account all devices
                # Same reason for gradient_accumulation_steps, but this time it's Accelerate that calculates the average gradient across the accumulated steps
                loss = (loss * gradient_accumulation_steps * accelerator.num_processes) / num_items_in_batch
                accelerator.backward(loss)
        model_optimizer.step()
        model_optimizer.zero_grad()
                

logger.warning(f"Device {accelerator.process_index} - w/ accumulation, the final model weight is {accelerator.unwrap_model(model).weight.detach().cpu().squeeze()}", main_process_only=False)

# We now do the same operation but on a single device and without gradient accumulation

if accelerator.is_main_process:
    # prepare one single entire batch
    dataloader = DataLoader(dataset, batch_size=len(dataset), collate_fn=collate_fn)
    full_batch_without_accum = next(iter(dataloader))
    total_inputs, total_labels = full_batch_without_accum["input_ids"], full_batch_without_accum["labels"]
    model_clone_optimizer = torch.optim.SGD(model_clone.parameters(), lr=0.08)
    
    # train the cloned model
    loss = torch.nn.CrossEntropyLoss(reduction="mean")(model_clone(total_inputs).view(-1, 2), total_labels.view(-1).to(torch.int64))
    model_clone_optimizer.zero_grad()
    loss.backward()
    model_clone_optimizer.step()
    
    # We should have the same final weights.
    logger.warning(f"w/o accumulation, the final model weight is {model_clone.weight.detach().cpu().squeeze()}")

```

Results on a single device - gradient accumulation steps set to 1 and batch_size set to 8:
```
initial model weight is tensor([-0.0075,  0.5364])
initial model clone weight is tensor([-0.0075,  0.5364])
Step 0 - Device 0 - num items in the local batch 36
Total num items 36
Device 0 - w/ accumulation, the final model weight is tensor([0.0953, 0.4337])
w/o accumulation, the final model weight is tensor([0.0953, 0.4337])
```

Results on a two-device set-up - gradient accumulation steps set to 2 and batch_size set to 4:
```
initial model weight is tensor([-0.0075,  0.5364])
initial model clone weight is tensor([-0.0075,  0.5364])
Step 0 - Device 0 - num items in the local batch 52
Step 0 - Device 1 - num items in the local batch 84
Total num items 136
Device 1 - w/ accumulation, the final model weight is tensor([0.2117, 0.3172])
Device 0 - w/ accumulation, the final model weight is tensor([0.2117, 0.3172])
w/o accumulation, the final model weight is tensor([0.2117, 0.3172])
```

### To go further:

Please find a complete example script on a real world training run in the examples folder at the path [`accelerate/examples/by_feature/gradient_accumulation_for_autoregressive_models.py`](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/gradient_accumulation_for_autoregressive_models.py).

Running it on several training configurations with constant global batch size equal to 32 gives the following graph:

<div style="text-align: center">
<img src="https://huggingface.co/datasets/hf-audio/gradient_accumulation_example/resolve/main/training_losses.png">
</div>

Note that the training losses are exactly the same up to training step 20. The small deviation after this training step occurs at the very end of the first epoch, because, by [default](https://huggingface.co/docs/accelerate/en/package_reference/torch_wrappers#accelerate.data_loader.prepare_data_loader.even_batches), the dataloader duplicates the samples at the beginning of the dataset when the total batch size doesn't exactly divide the dataset.


================================================
FILE: docs/source/usage_guides/intel_cpu.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Training on Intel CPU

## How It Works For Training optimization in CPU

Accelerate has full support for Intel CPUs; all you need to do is enable it through the config.

**Scenario 1**: Acceleration of non-distributed CPU training

Run <u>accelerate config</u> on your machine:

```bash
$ accelerate config
-----------------------------------------------------------------------------------------------------------------------------------------------------------
In which compute environment are you running?
This machine
-----------------------------------------------------------------------------------------------------------------------------------------------------------
Which type of machine are you using?
No distributed training
Do you want to run your training on CPU only (even if a GPU / Apple Silicon device is available)? [yes/NO]:yes
Do you wish to optimize your script with torch dynamo?[yes/NO]:NO
Do you want to use DeepSpeed? [yes/NO]: NO
-----------------------------------------------------------------------------------------------------------------------------------------------------------
Do you wish to use FP16 or BF16 (mixed precision)?
bf16
```
This will generate a config file that will be used automatically to properly set the
default options when doing

```bash
accelerate launch my_script.py --args_to_my_script
```

For instance, here is how you would run the NLP example `examples/nlp_example.py` (from the root of the repo) with `default_config.yaml` which is generated by `accelerate config`

```yaml
compute_environment: LOCAL_MACHINE
distributed_type: 'NO'
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: true
```
```bash
accelerate launch examples/nlp_example.py
```

> [!CAUTION]
> `accelerator.prepare` can currently only handle simultaneously preparing multiple models (and no optimizer) OR a single model-optimizer pair for training. Other attempts (e.g., two model-optimizer pairs) will raise a verbose error. To work around this limitation, consider separately using `accelerator.prepare` for each model-optimizer pair.

**Scenario 2**: Acceleration of distributed CPU training

We use Intel oneCCL for communication, combined with the Intel® MPI library, to deliver flexible, efficient, scalable cluster messaging on Intel® architecture. Refer to [this guide](https://huggingface.co/docs/transformers/perf_train_cpu_many) for installation instructions.

Run <u>accelerate config</u> on your machine(node0):

```bash
$ accelerate config
-----------------------------------------------------------------------------------------------------------------------------------------------------------
In which compute environment are you running?
This machine
-----------------------------------------------------------------------------------------------------------------------------------------------------------
Which type of machine are you using?
multi-CPU
How many different machines will you use (use more than 1 for multi-node training)? [1]: 4
-----------------------------------------------------------------------------------------------------------------------------------------------------------
What is the rank of this machine?
0
What is the IP address of the machine that will host the main process? 36.112.23.24
What is the port you will use to communicate with the main process? 29500
Are all the machines on the same local network? Answer `no` if nodes are on the cloud and/or on different network hosts [YES/no]: yes
Do you want accelerate to launch mpirun? [yes/NO]: yes
Please enter the path to the hostfile to use with mpirun [~/hostfile]: ~/hostfile
Enter the number of oneCCL worker threads [1]: 1
Do you wish to optimize your script with torch dynamo?[yes/NO]:NO
How many processes should be used for distributed training? [1]:16
-----------------------------------------------------------------------------------------------------------------------------------------------------------
Do you wish to use FP16 or BF16 (mixed precision)?
bf16
```
For instance, here is how you would run the NLP example `examples/nlp_example.py` (from the root of the repo) for distributed CPU training.

`default_config.yaml` which is generated by `accelerate config`
```yaml
compute_environment: LOCAL_MACHINE
distributed_type: MULTI_CPU
downcast_bf16: 'no'
machine_rank: 0
main_process_ip: 36.112.23.24
main_process_port: 29500
main_training_function: main
mixed_precision: bf16
mpirun_config:
  mpirun_hostfile: /home/user/hostfile
num_machines: 4
num_processes: 16
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: true
```

Set up the following environment and use Intel MPI to launch the training.

In `node0`, you need to create a configuration file which contains the IP addresses of each node (for example hostfile) and pass that configuration file path as an argument.

If you selected to let Accelerate launch `mpirun`, ensure that the location of your hostfile matches the path in the config.

```bash
$ cat hostfile
xxx.xxx.xxx.xxx #node0 ip
xxx.xxx.xxx.xxx #node1 ip
xxx.xxx.xxx.xxx #node2 ip
xxx.xxx.xxx.xxx #node3 ip
```

```bash
accelerate launch examples/nlp_example.py
```

You can also launch distributed training directly with the `mpirun` command. Run the following command on node0, and DDP with **16 processes** will be enabled across node0, node1, node2, and node3 with BF16 mixed precision. When using this method, the Python script, Python environment, and Accelerate config file need to be available on all of the machines used for multi-CPU training.

```bash
export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip
mpirun -f hostfile -n 16 -ppn 4 accelerate launch examples/nlp_example.py
```


================================================
FILE: docs/source/usage_guides/local_sgd.md
================================================
<!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Using Local SGD with Accelerate

Local SGD is a technique for distributed training where gradients are not synchronized every step. Thus, each process updates its own version of the model weights and after a given number of steps these weights are synchronized by averaging across all processes. This improves communication efficiency and can lead to substantial training speed up especially when a computer lacks a faster interconnect such as NVLink.
Unlike gradient accumulation (where improving communication efficiency requires increasing the effective batch size), Local SGD does not require changing a batch size or a learning rate / schedule. However, if necessary, Local SGD can be combined with gradient accumulation as well.

In this tutorial you will see how to quickly set up Local SGD with Accelerate. Compared to a standard Accelerate setup, this requires only two extra lines of code.

This example will use a very simplistic PyTorch training loop that performs gradient accumulation every two batches:

```python
device = "cuda"
model.to(device)

gradient_accumulation_steps = 2

for index, batch in enumerate(training_dataloader):
    inputs, targets = batch
    inputs = inputs.to(device)
    targets = targets.to(device)
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    loss = loss / gradient_accumulation_steps
    loss.backward()
    if (index + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
```

## Converting it to Accelerate

First, the code shown earlier will be converted to use Accelerate with neither a LocalSGD nor a gradient accumulation helper:

```diff
+ from accelerate import Accelerator
+ accelerator = Accelerator()

+ model, optimizer, training_dataloader, scheduler = accelerator.prepare(
+     model, optimizer, training_dataloader, scheduler
+ )

  for index, batch in enumerate(training_dataloader):
      inputs, targets = batch
-     inputs = inputs.to(device)
-     targets = targets.to(device)
      outputs = model(inputs)
      loss = loss_function(outputs, targets)
      loss = loss / gradient_accumulation_steps
+     accelerator.backward(loss)
      if (index+1) % gradient_accumulation_steps == 0:
          optimizer.step()
          scheduler.step()
          optimizer.zero_grad()
```

## Letting Accelerate handle model synchronization 

All that is left now is to let Accelerate handle model parameter synchronization **and** the gradient accumulation for us. For simplicity let us assume we need to synchronize every 8 steps. This is
achieved by adding one `with LocalSGD` statement and one call `local_sgd.step()` after every optimizer step:

```diff
+from accelerate.local_sgd import LocalSGD
+
+local_sgd_steps=8

+with LocalSGD(accelerator=accelerator, model=model, local_sgd_steps=local_sgd_steps, enabled=True) as local_sgd:
    for batch in training_dataloader:
        with accelerator.accumulate(model):
            inputs, targets = batch
            outputs = model(inputs)
            loss = loss_function(outputs, targets)
            accelerator.backward(loss)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
+           local_sgd.step()
```

Under the hood, the Local SGD code **disables** automatic gradient synchronization (but accumulation still works as expected!). Instead it averages model parameters every `local_sgd_steps` steps (as well as at the end of the training loop).

## Limitations

The current implementation works only with basic multi-GPU (or multi-CPU) training without, e.g., [DeepSpeed](https://github.com/deepspeedai/DeepSpeed).

## References

Although we are not aware of the true origins of this simple approach, the idea of local SGD is quite old and goes
back to at least:

Zhang, J., De Sa, C., Mitliagkas, I., & Ré, C. (2016). [Parallel SGD: When does averaging help? arXiv preprint
arXiv:1606.07365.](https://huggingface.co/papers/1606.07365)

We credit the term Local SGD to the following paper (but there might be earlier references we are not aware of).

Stich, Sebastian Urban. ["Local SGD Converges Fast and Communicates Little." ICLR 2019-International Conference on
Learning Representations. No. CONF. 2019.](https://huggingface.co/papers/1805.09767)


================================================
FILE: docs/source/usage_guides/low_precision_training.md
================================================
<!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Low Precision Training Methods

Accelerate provides integrations to train on lower precision methods using specified supported hardware through the `TransformersEngine`, `MS-AMP`, and `torchao` packages. This documentation will help guide you through what hardware is supported, how to configure your [`Accelerator`] to leverage the low precision methods, and what you can expect when training. 

## What training on FP8 means

To explore more of the nitty-gritty in training in FP8 with PyTorch and Accelerate, check out the [concept_guide](../concept_guides/low_precision_training) on why this can be difficult. But essentially rather than training in BF16, some (or all) aspects of training a model can be performed using 8 bits instead of 16. The challenge is doing so without degrading final performance. 

This is only enabled on specific NVIDIA hardware, namely:

* Anything after the 3000 series consumer graphics cards (such as the 4090)
* Hopper-based GPU architectures (such as the `H100` and `H200`)

What this will result in is some reduction in the memory used (as we've cut the needed memory in half for some parts of training) and an increase in throughput *should* be seen as well for larger models that can replace certain layers with FP8-enabled ones.

## Configuring the Accelerator

Currently two actively maintained backends for FP8 are supported (`TransformersEngine` and `torchao`), each with different capabilities and configurations. A legacy `MS-AMP` backend also exists but is no longer recommended (see [below](#configuring-ms-amp) for details).

To use either, the same core API is used. Just pass `mixed_precision="fp8"` to the [`Accelerator`], select it during `accelerate config` when prompted about mixed precision, or set it in the `mixed_precision` key of your `config.yaml` file:

```{python}
from accelerate import Accelerator
accelerator = Accelerator(mixed_precision="fp8")
```

To specify a backend (and customize other parts of the FP8 mixed precision setup), you can utilize one of the `RecipeKwargs` dataclasses such as [`utils.AORecipeKwargs`], [`utils.TERecipeKwargs`], or [`utils.MSAMPRecipeKwargs`]; you can also specify it in your config `yaml` or during `accelerate launch`. We recommend using `TransformersEngine` or `torchao` for new projects:

```{python}
from accelerate import Accelerator
from accelerate.utils import TERecipeKwargs, AORecipeKwargs
# Use TransformersEngine
kwargs = [TERecipeKwargs()]
# Or to use torchao
# kwargs = [AORecipeKwargs()]
accelerator = Accelerator(mixed_precision="fp8", kwarg_handlers=kwargs)
```

```{yaml}
mixed_precision: fp8
fp8_config:
  amax_compute_algo: max
  amax_history_len: 1024
  backend: TE
  fp8_format: HYBRID
  interval: 1
  margin: 0
  override_linear_precision: (false, false, false)
  use_autocast_during_eval: false
```

## Configuring MS-AMP

<Tip warning={true}>

**⚠️ Deprecated / Unmaintained:** MS-AMP is no longer actively maintained by Microsoft. The [MS-AMP repository](https://github.com/Azure/MS-AMP) has not received updates since 2023 and has known compatibility issues:

- Requires CUDA 11.x (does not support CUDA 12.x+)
- Requires older NCCL versions incompatible with recent PyTorch releases
- Does not support recent PyTorch versions (2.2+)

**We strongly recommend using [`TransformersEngine`](#configuring-transformersengine) or [`torchao`](#configuring-torchao) instead for all new and existing FP8 training workflows.** Both are actively maintained and support modern CUDA/PyTorch versions. Native PyTorch FP8 support via `torchao` is particularly promising as a vendor-neutral solution.

The MS-AMP backend is retained in Accelerate for legacy compatibility but may be removed in a future release.

</Tip>

`MS-AMP` has a single configuration argument: the optimization level. 

Currently two levels of optimization are supported in the Accelerate integration, `"O1"` and `"O2"` (using the letter 'o', not zero). 

* `"O1"` will cast the weight gradients and `all_reduce` communications to happen in 8-bit, while the rest are done in 16 bit. This reduces overall GPU memory usage and speeds up communication.
* `"O2"` will also cast first-order optimizer states into 8 bit, while the second order states are in FP16. (Currently just the `Adam` optimizer is supported). This tries its best to minimize final accuracy degradation and will save the highest potential memory.

To specify an optimization level, pass it to the `FP8KwargsHandler` by setting the `optimization_level` argument:

```{python}
from accelerate import Accelerator
from accelerate.utils import FP8RecipeKwargs
kwargs = [FP8RecipeKwargs(backend="msamp", optimization_level="O2")]
accelerator = Accelerator(mixed_precision="fp8", kwarg_handlers=kwargs)
```

Or during `accelerate launch` via `--fp8_backend=msamp --fp8_opt_level=O2`

Similarly this can be set in your `config.yaml`:

```{yaml}
mixed_precision: fp8
fp8_config:
    backend: MSAMP
    opt_level: O2
```

## Configuring TransformersEngine

TransformersEngine has many options for customizing how and what FP8 calculations are performed. A full list of supported arguments and what they mean is available in [NVIDIA's documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html); however, they are restated as part of [`FP8KwargsHandler`]'s docstring for your convenience. 

Accelerate tries to set sensible defaults, but exploring and tweaking the various parameters yourself can lead to better performance potentially.

To use it, specify `backend="te"` and modify any of the arguments you want as part of your kwarg handler:

```{python}
from accelerate import Accelerator
from accelerate.utils import FP8RecipeKwargs
kwargs = [FP8RecipeKwargs(backend="te", ...)]
accelerator = Accelerator(mixed_precision="fp8", kwarg_handlers=kwargs)
```

Or during `accelerate launch` via `--fp8_backend=te ...`. Use `accelerate launch --fp8_backend=te -h` to see relevant arguments.

Similarly this can be set in your `config.yaml`:

```{yaml}
mixed_precision: fp8
fp8_config:
    amax_compute_algo: max
    amax_history_len: 1024
    backend: TE
    fp8_format: HYBRID
    interval: 1
    margin: 0
    override_linear_precision: (false, false, false)
    use_autocast_during_eval: false
```

## Configuring `torchao`

`torchao` is a [PyTorch-driven](https://github.com/pytorch/ao/tree/main/torchao/float8) hackable FP8 backend, aiming to be more approachable than the prior two engines. One of the core differences with `ao` compared to the prior two is that for numerical stability, it's found to be generally better off keeping the first *and* last layers in the model at the regular precision (be it FP32 or BF16), and then the other layers quantized down to FP8. As a result, a config for `ao` looks a bit different:

> Note: this API is experimental and is subject to change

```{python}
from accelerate import Accelerator
from accelerate.utils import AORecipeKwargs, TorchDynamoPlugin, FullyShardedDataParallelPlugin
from torchao.float8 import Float8LinearConfig

fsdp2_plugin = FullyShardedDataParallelPlugin(
  fsdp_version=2,
  cpu_ram_efficient_loading=False, # CPU RAM efficient loading CANNOT work with fp8 torchao
  fsdp_auto_wrap_policy="TRANSFORMER_BASED_WRAP",
)
dynamo_plugin = TorchDynamoPlugin(
  backend="inductor",
  use_regional_compilation=True,
)
fp8_config = Float8LinearConfig(
  enable_fsdp_float8_all_gather=True, # Use FP8 all_gather in FSDP2
  pad_inner_dim=True,
)
kwargs = [AORecipeKwargs(
  config=fp8_config
)]
accelerator = Accelerator(
  mixed_precision="fp8",
  fsdp_plugin=fsdp2_plugin,
  dynamo_plugin=dynamo_plugin,
  kwarg_handlers=kwargs,
)
```

Or during `accelerate launch` via `--fp8_backend=ao ...`. Use `accelerate launch --fp8_backend=ao -h` to see relevant arguments.

Similarly, this can be set in `config.yaml`:

```{yaml}
mixed_precision: fp8
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_cpu_ram_efficient_loading: false
  fsdp_version: 2
fp8_config:
  backend: AO
  pad_inner_dim: true
  enable_fsdp_float8_all_gather: true
dynamo_config:
  dynamo_backend: INDUCTOR
  dynamo_use_regional_compilation: true
```

To learn more about the specific parameters to be used, please see the official `torchao` repo.


## Example Zoo

We have examples showcasing training with FP8 both with accelerate and its underlying implementation available in the accelerate repo.
Currently we support scripts showcasing:

* Single GPU
* Distributed Data Parallelism (Multi-GPU)
* Fully Sharded Data Parallelism
* DeepSpeed ZeRO 1 through 3

Find out more [here](https://github.com/huggingface/accelerate/tree/main/benchmarks/fp8)

## Further Reading

To learn more about training in FP8 please check out the following resources:

* [Our concept guide](../concept_guides/low_precision_training), which goes into more detail about TransformersEngine, torchao, and MS-AMP
* [The `transformers-engine` documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html)
* [The `torchao` documentation](https://github.com/pytorch/ao/tree/main/torchao/float8)
* [The `MS-AMP` documentation](https://azure.github.io/MS-AMP/docs/) (⚠️ no longer maintained)


================================================
FILE: docs/source/usage_guides/megatron_lm.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->


# Megatron-LM

[Megatron-LM](https://github.com/NVIDIA/Megatron-LM) enables training large transformer language models at scale.
It provides efficient tensor, pipeline and sequence based model parallelism for pre-training transformer based
Language Models such as [GPT](https://huggingface.co/papers/2005.14165) (Decoder Only), [BERT](https://huggingface.co/papers/1810.04805) (Encoder Only) and [T5](https://huggingface.co/papers/1910.10683) (Encoder-Decoder).
For detailed information and how things work behind the scenes, please refer to the GitHub [repo](https://github.com/NVIDIA/Megatron-LM).

## What is integrated?

Accelerate integrates the following features of Megatron-LM to enable large scale pre-training/finetuning
of BERT (Encoder), GPT (Decoder) or T5 models (Encoder and Decoder):

a. **Tensor Parallelism (TP)**: Reduces memory footprint without much additional communication on intra-node ranks.
Each tensor is split into multiple chunks with each shard residing on separate GPU. At each step, the same mini-batch of data is processed
independently and in parallel by each shard followed by syncing across all GPUs (`all-reduce` operation). 
In a simple transformer layer, this leads to 2 `all-reduces` in the forward path and 2 in the backward path.
For more details, please refer to the research paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using
Model Parallelism](https://huggingface.co/papers/1909.08053) and 
this section of blogpost [The Technology Behind BLOOM Training](https://huggingface.co/blog/bloom-megatron-deepspeed#tensor-parallelism).


b. **Pipeline Parallelism (PP)**: Reduces memory footprint and enables large scale training via inter-node parallelization. 
Reduces the bubble of naive PP via PipeDream-Flush schedule/1F1B schedule and Interleaved 1F1B schedule. 
Layers are distributed uniformly across PP stages. For example, if a model has `24` layers and we have `4` GPUs for
pipeline parallelism, each GPU will have `6` layers (24/4). For more details on schedules to reduce the idle time of PP,
please refer to the research paper [Efficient Large-Scale Language Model Training on GPU Clusters
Using Megatron-LM](https://huggingface.co/papers/2104.04473) and 
this section of blogpost [The Technology Behind BLOOM Training](https://huggingface.co/blog/bloom-megatron-deepspeed#pipeline-parallelism).

c. **Sequence Parallelism (SP)**: Reduces memory footprint without any additional communication. Only applicable when using TP.
It reduces activation memory required as it prevents the same copies to be on the tensor parallel ranks 
post `all-reduce` by replacing them with `reduce-scatter` and `no-op` operation would be replaced by `all-gather`. 
As `all-reduce = reduce-scatter + all-gather`, this saves a ton of activation memory at no added communication cost. 
To put it simply, it shards the outputs of each transformer layer along sequence dimension, e.g., 
if the sequence length is `1024` and the TP size is `4`, each GPU will have `256` tokens (1024/4) for each sample. 
This increases the batch size that can be supported for training. For more details, please refer to the research paper
[Reducing Activation Recomputation in Large Transformer Models](https://huggingface.co/papers/2205.05198). 

d. **Data Parallelism (DP)** via Distributed Optimizer: Reduces the memory footprint by sharding optimizer states and gradients across DP ranks
(versus the traditional method of replicating the optimizer state across data parallel ranks). 
For example, when using Adam optimizer with mixed-precision training, each parameter accounts for 12 bytes of memory.
This gets distributed equally across the GPUs, i.e., each parameter would account for 3 bytes (12/4) if we have 4 GPUs.
For more details, please refer to the research paper [ZeRO: Memory Optimizations Toward Training Trillion
Parameter Models](https://huggingface.co/papers/1910.02054) and following section of blog 
[The Technology Behind BLOOM Training](https://huggingface.co/blog/bloom-megatron-deepspeed#zero-data-parallelism).

e. **Expert Parallelism (EP)**: Expert parallelism in Megatron-LM is used for Mixture-of-Experts (MoE) layers, where many “experts” (small feed-forward networks) exist but only a few are activated for each token. Instead of putting all experts on every GPU, Megatron distributes different experts across different GPUs—this is expert parallelism. During training, tokens are routed to the GPUs that host their selected experts, computed there, and then sent back, reducing memory cost. It is often combined with tensor/pipeline parallelism for large-scale models.

f. **Full Activation Recomputation**: Reduces the memory footprint of activations significantly via smart activation checkpointing.
It doesn't store activations occupying large memory while being fast to recompute thereby achieving great tradeoff between memory and recomputation.
For example, for GPT-3, this leads to 70% reduction in required memory for activations at the expense of
only 2.7% FLOPs overhead for recomputation of activations. For more details, please refer to the research paper 
[Reducing Activation Recomputation in Large Transformer Models](https://huggingface.co/papers/2205.05198).

g. **Fused Kernels**: Fused Softmax, Mixed Precision Fused Layer Norm and Fused gradient accumulation to weight gradient computation of linear layer.
PyTorch JIT compiled Fused GeLU and Fused Bias+Dropout+Residual addition.

h. **Support for Indexed datasets**: Efficient binary format of datasets for large scale training. Support for the `mmap`, `cached` index file and the `lazy` loader format.

i. **Checkpoint reshaping and interoperability**: Utility for reshaping Megatron-LM checkpoints of variable 
tensor and pipeline parallel sizes to the beloved Transformers sharded checkpoints as it has great support with plethora of tools
such as Accelerate Big Model Inference, Megatron-DeepSpeed Inference etc. 
Support is also available for converting Transformers sharded checkpoints to Megatron-LM checkpoint of variable tensor and pipeline parallel sizes
for large scale training.  


## Pre-Requisites 

You will need to install the latest pytorch, cuda, nccl, and NVIDIA [APEX](https://github.com/NVIDIA/apex#quick-start) releases and the nltk library.
See [documentation](https://github.com/NVIDIA/Megatron-LM#setup) for more details. 
Another way to setup the environment is to pull an NVIDIA PyTorch Container that comes with all the required installations from NGC.

Below is a step-by-step method to set up the conda environment:

1. Create a virtual environment
```
conda create --name ml
```

2. Assuming that the machine has CUDA 11.3 installed, installing the corresponding PyTorch GPU Version
```
conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
```

3. Install Nvidia APEX
```
git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
cd ..
```

4. Installing Megatron-LM

```
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout 9a1c0d05c992c8a241da384ab27dce2021bb56dd
# NOTE: you need to manually move gpt_builders.py to megatron/training and update
# the `include` list in the pyproject.toml file as shown below to unblock yourself
# from using Megatron:
# include = [
#     "megatron.core",
#     "megatron.core.*",
#     "megatron.training",
#     "megatron.training.*",
#     "megatron.legacy",
#     "megatron.legacy.*",
# ]
pip install --no-use-pep517 -e .
```

## Prepare Megatron-LM checkpoint
If you want to fine-tune a model, make sure you have a torch dist format checkpoint ready. If you only have access to the Hugging Face model, please consider converting it to a torch dist format checkpoint acceptable to Megatron. One option is to use slime's conversion script; taking GLM models as an example:
```
source /your/path/to/slime/scripts/models/glm4.5-355B-A32B.sh
srun torchrun --nproc-per-node 8 \
   /your/path/to/slime/tools/convert_hf_to_torch_dist.py \
    ${MODEL_ARGS[@]} \
    --hf-checkpoint /your/path/to/huggingface/models/GLM4.5-355B-A32B \
    --save /your/path/to/megatron/models/GLM4.5-355B-A32B_torch_dist

```
After the conversion, make sure that: 1. under `/your/path/to/megatron/models/GLM4.5-355B-A32B_torch_dist`, you change the content of `latest_checkpointed_iteration.txt` from `release` to `0` and rename the directory `release` to `iter_0000000`; 2. in the config, `megatron_lm_no_load_optim` is set to true so that no optimizer states are needed.

## Accelerate Megatron-LM Plugin

Important features are directly supported via the `accelerate config` command. 
An example of the corresponding questions for using Megatron-LM features is shown below:

```bash
:~$ accelerate config --config_file "megatron_gpt_config.yaml"
In which compute environment are you running? ([0] This machine, [1] AWS (Amazon SageMaker)): 0
Which type of machine are you using? ([0] No distributed training, [1] multi-CPU, [2] multi-GPU, [3] TPU): 2
How many different machines will you use (use more than 1 for multi-node training)? [1]: 
Do you want to use DeepSpeed? [yes/NO]: 
Do you want to use FullyShardedDataParallel? [yes/NO]: 
Do you want to use Megatron-LM ? [yes/NO]: yes
What is the Tensor Parallelism degree/size? [1]:2
Do you want to enable Sequence Parallelism? [YES/no]: 
What is the Pipeline Parallelism degree/size? [1]:2
What is the number of micro-batches? [1]:2
Do you want to enable selective activation recomputation? [YES/no]: 
Do you want to use distributed optimizer which shards optimizer state and gradients across data parallel ranks? [YES/no]: 
What is the gradient clipping value based on global L2 Norm (0 to disable)? [1.0]: 
How many GPU(s) should be used for distributed training? [1]:4
Do you wish to use FP16 or BF16 (mixed precision)? [NO/fp16/bf16]: bf16
```

The resulting config is shown below:

```
~$ cat megatron_gpt_config.yaml 
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: MEGATRON_LM
downcast_bf16: 'no'
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
megatron_lm_config:
  megatron_lm_gradient_clipping: 1.0
  megatron_lm_num_micro_batches: 2
  megatron_lm_pp_degree: 2
  megatron_lm_recompute_activations: true
  megatron_lm_sequence_parallelism: true
  megatron_lm_tp_degree: 2
  megatron_lm_use_distributed_optimizer: true
mixed_precision: bf16
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
use_cpu: false
```

We will take the example of GPT pre-training. The minimal changes required to the official `run_clm_no_trainer.py` 
to use Megatron-LM are as follows:

1. As Megatron-LM uses its own implementation of Optimizer, the corresponding scheduler compatible with it needs to be used.
As such, support for only the Megatron-LM's scheduler is present. User will need to create `accelerate.utils.MegatronLMDummyScheduler`.
Example is given below:

```python
from accelerate.utils import MegatronLMDummyScheduler

if accelerator.distributed_type == DistributedType.MEGATRON_LM:
    lr_scheduler = MegatronLMDummyScheduler(
        optimizer=optimizer,
        total_num_steps=args.max_train_steps,
        warmup_num_steps=args.num_warmup_steps,
    )
else:
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
    )
```

2. Getting the details of the total batch size now needs to be cognizant of tensor and pipeline parallel sizes.
Example of getting the effective total batch size is shown below:

```python
if accelerator.distributed_type == DistributedType.MEGATRON_LM:
    total_batch_size = accelerator.state.megatron_lm_plugin.global_batch_size
else:
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
```

3. When using Megatron-LM, the losses are already averaged across the data parallel group

```python
if accelerator.distributed_type == DistributedType.MEGATRON_LM:
    losses.append(loss)
else:
    losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size)))

if accelerator.distributed_type == DistributedType.MEGATRON_LM:
    losses = torch.tensor(losses)
else:
    losses = torch.cat(losses)
```

4. For Megatron-LM, we need to save the model using `accelerator.save_state`

```python
if accelerator.distributed_type == DistributedType.MEGATRON_LM:
    accelerator.save_state(args.output_dir)
else:
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(
        args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
    )
```

That's it! We are good to go 🚀. Please find the example script in the examples folder at the path `accelerate/examples/by_feature/megatron_lm_gpt_pretraining.py`.
Let's run it for the `gpt2-large` model architecture using 4 A100-80GB GPUs.

```bash
accelerate launch --config_file megatron_gpt_config.yaml \
examples/by_feature/megatron_lm_gpt_pretraining.py \
--config_name "gpt2-large" \
--tokenizer_name "gpt2-large" \
--dataset_name wikitext \
--dataset_config_name wikitext-2-raw-v1 \
--block_size 1024 \
--learning_rate 5e-5 \
--per_device_train_batch_size 24 \
--per_device_eval_batch_size 24 \
--num_train_epochs 5 \
--with_tracking \
--report_to "wandb" \
--output_dir "awesome_model"
```

Below are some important excerpts from the output logs:

```bash
Loading extension module fused_dense_cuda...
>>> done with compiling and loading fused kernels. Compilation time: 3.569 seconds
 > padded vocab (size: 50257) with 175 dummy tokens (new size: 50432)
Building gpt model in the pre-training mode.
The Megatron LM model weights are initialized at random in `accelerator.prepare`. Please use `accelerator.load_checkpoint` to load a pre-trained checkpoint matching the distributed setup.
Preparing dataloader
Preparing dataloader
Preparing model
 > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 210753280
 > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 209445120
 > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 210753280
 > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 209445120
Preparing optimizer
Preparing scheduler
> learning rate decay style: linear
10/10/2022 22:57:22 - INFO - __main__ - ***** Running training *****
10/10/2022 22:57:22 - INFO - __main__ -   Num examples = 2318
10/10/2022 22:57:22 - INFO - __main__ -   Num Epochs = 5
10/10/2022 22:57:22 - INFO - __main__ -   Instantaneous batch size per device = 24
10/10/2022 22:57:22 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 48
10/10/2022 22:57:22 - INFO - __main__ -   Gradient Accumulation steps = 1
10/10/2022 22:57:22 - INFO - __main__ -   Total optimization steps = 245
 20%|████████████▍                                                 | 49/245 [01:04<04:09,  1.27s/it]
 10/10/2022 22:58:29 - INFO - __main__ - epoch 0: perplexity: 1222.1594275215962 eval_loss: 7.10837459564209
 40%|████████████████████████▊                                     | 98/245 [02:10<03:07,  1.28s/it]
 10/10/2022 22:59:35 - INFO - __main__ - epoch 1: perplexity: 894.5236583794557 eval_loss: 6.796291351318359
 60%|████████████████████████████████████▌                        | 147/245 [03:16<02:05,  1.28s/it]
 10/10/2022 23:00:40 - INFO - __main__ - epoch 2: perplexity: 702.8458788508042 eval_loss: 6.555137634277344
 80%|████████████████████████████████████████████████▊            | 196/245 [04:22<01:02,  1.28s/it]
 10/10/2022 23:01:46 - INFO - __main__ - epoch 3: perplexity: 600.3220028695281 eval_loss: 6.39746618270874
100%|█████████████████████████████████████████████████████████████| 245/245 [05:27<00:00,  1.28s/it]
```

There are a large number of other options/features that one can set using `accelerate.utils.MegatronLMPlugin`.

## Advanced features to leverage writing custom train step and Megatron-LM Indexed Datasets

For leveraging more features, please go through below details.

1. Below is an example of changes required to customize the Train Step while using Megatron-LM. 
You will implement the `accelerate.utils.AbstractTrainStep` or inherit from their corresponding children 
`accelerate.utils.GPTTrainStep`, `accelerate.utils.BertTrainStep` or `accelerate.utils.T5TrainStep`.

```python
from accelerate.utils import MegatronLMDummyScheduler, GPTTrainStep, avg_losses_across_data_parallel_group


# Custom loss function for the Megatron model
class GPTTrainStepWithCustomLoss(GPTTrainStep):
    def __init__(self, megatron_args, **kwargs):
        super().__init__(megatron_args)
        self.kwargs = kwargs

    def get_loss_func(self):
        def loss_func(inputs, loss_mask, output_tensor):
            batch_size, seq_length = output_tensor.shape
            losses = output_tensor.float()
            loss_mask = loss_mask.view(-1).float()
            loss = losses.view(-1) * loss_mask

            # Resize and average loss per sample
            loss_per_sample = loss.view(batch_size, seq_length).sum(axis=1)
            loss_mask_per_sample = loss_mask.view(batch_size, seq_length).sum(axis=1)
            loss_per_sample = loss_per_sample / loss_mask_per_sample

            # Calculate and scale weighting
            weights = torch.stack([(inputs == kt).float() for kt in self.kwargs["keytoken_ids"]]).sum(axis=[0, 2])
            weights = 1.0 + self.kwargs["alpha"] * weights
            # Calculate weighted average
            weighted_loss = (loss_per_sample * weights).mean()

            # Reduce loss across data parallel groups
            averaged_loss = avg_losses_across_data_parallel_group([weighted_loss])

            return weighted_loss, {"lm loss": averaged_loss[0]}

        return loss_func

    def get_forward_step_func(self):
        def forward_step(data_iterator, model):
            """Forward step."""
            # Get the batch.
            tokens, labels, loss_mask, attention_mask, position_ids = self.get_batch(data_iterator)
            output_tensor = model(tokens, position_ids, attention_mask, labels=labels)

            return output_tensor, partial(self.loss_func, tokens, loss_mask)

        return forward_step


def main():
    # Custom loss function for the Megatron model
    keytoken_ids = []
    keywords = ["plt", "pd", "sk", "fit", "predict", " plt", " pd", " sk", " fit", " predict"]
    for keyword in keywords:
        ids = tokenizer([keyword]).input_ids[0]
        if len(ids) == 1:
            keytoken_ids.append(ids[0])
    accelerator.print(f"Keytoken ids: {keytoken_ids}")
    accelerator.state.megatron_lm_plugin.custom_train_step_class = GPTTrainStepWithCustomLoss
    accelerator.state.megatron_lm_plugin.custom_train_step_kwargs = {
        "keytoken_ids": keytoken_ids,
        "alpha": 0.25,
    }
```

2. For using the Megatron-LM datasets, a few more changes are required. Dataloaders for these datasets
are available only on rank 0 of each tensor parallel group. As such, there are ranks where the dataloader won't be
available and this requires tweaks to the training loop. Being able to do all this shows how
flexible and extensible Accelerate is. The changes required are as follows.

a. For Megatron-LM indexed datasets, we need to use `MegatronLMDummyDataLoader` 
and pass the required dataset args to it such as `data_path`, `seq_length` etc. 
See [here](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/arguments.py#L804) for the list of available args. 
    
```python
from accelerate.utils import MegatronLMDummyDataLoader

megatron_dataloader_config = {
    "data_path": args.data_path,
    "splits_string": args.splits_string,
    "seq_length": args.block_size,
    "micro_batch_size": args.per_device_train_batch_size,
}
megatron_dataloader = MegatronLMDummyDataLoader(**megatron_dataloader_config)
accelerator.state.megatron_lm_plugin.megatron_dataset_flag = True
```

b. `megatron_dataloader` is repeated 3 times to get training, validation and test dataloaders
as per the `args.splits_string` proportions
    
```python
model, optimizer, lr_scheduler, train_dataloader, eval_dataloader, _ = accelerator.prepare(
    model, optimizer, lr_scheduler, megatron_dataloader, megatron_dataloader, megatron_dataloader
)
```

c. Changes to the training and evaluation loops are required, as the dataloader is only available on tensor parallel rank 0.
So, we need to iterate only if the dataloader isn't `None`, else provide an empty dict.
As such, we loop using a `while` loop and break when `completed_steps` is equal to `args.max_train_steps`.
This is similar to the Megatron-LM setup wherein the user has to provide `max_train_steps` when using Megatron-LM indexed datasets.
This displays how flexible and extensible Accelerate is.

```python
while completed_steps < args.max_train_steps:
    model.train()
    batch = next(train_dataloader) if train_dataloader is not None else {}
    outputs = model(**batch)
    loss = outputs.loss
    ...

    if completed_steps % eval_interval == 0:
        eval_completed_steps = 0
        losses = []
        while eval_completed_steps < eval_iters:
            model.eval()
            with torch.no_grad():
                batch = next(eval_dataloader) if eval_dataloader is not None else {}
                outputs = model(**batch)
```

    
## Utility for Checkpoint reshaping and interoperability

1. The scripts for these are present in Transformers library under respective models. 
Currently, it is available for GPT model [checkpoint_reshaping_and_interoperability.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py)

2. Below is an example of conversion of checkpoint from Megatron-LM to universal Transformers sharded checkpoint.
```bash
python checkpoint_reshaping_and_interoperability.py \
--convert_checkpoint_from_megatron_to_transformers \
--load_path "gpt/iter_0005000" \
--save_path "gpt/trfs_checkpoint" \
--max_shard_size "200MB" \
--tokenizer_name "gpt2" \
--print-checkpoint-structure
```

3. Conversion of checkpoint from transformers to megatron with `tp_size=2`, `pp_size=2` and `dp_size=2`.
```bash
python checkpoint_utils/megatron_gpt2/checkpoint_reshaping_and_interoperability.py \
--load_path "gpt/trfs_checkpoint" \
--save_path "gpt/megatron_lm_checkpoint" \
--target_tensor_model_parallel_size 2 \
--target_pipeline_model_parallel_size 2 \
--target_data_parallel_size 2 \
--target_params_dtype "bf16" \
--make_vocab_size_divisible_by 128 \
--use_distributed_optimizer \
--print-checkpoint-structure
```

## Megatron-LM GPT models support returning logits and `megatron_generate` function for text generation

1. Returning logits requires setting `return_logits=True` in `MegatronLMPlugin` as shown below.
These would be available in the last stage of pipeline.
```python
megatron_lm_plugin = MegatronLMPlugin(return_logits=True)
```

2. `megatron_generate` method for Megatron-LM GPT model: This will use Tensor and Pipeline Parallelism to complete 
generations for a batch of inputs when using greedy with/without top_k/top_p sampling and for individual prompt inputs when using beam search decoding. 
Only a subset of features of transformers generate is supported. This will help in using large models via tensor and pipeline parallelism 
for generation (already does key-value caching and uses fused kernels by default).
This requires data parallel size to be 1, sequence parallelism and activation checkpointing to be disabled.
It also requires specifying path to tokenizer's vocab file and merges file. 
Below example shows how to configure and use `megatron_generate` method for Megatron-LM GPT model.
```python
# specifying tokenizer's vocab and merges file
vocab_file = os.path.join(args.resume_from_checkpoint, "vocab.json")
merge_file = os.path.join(args.resume_from_checkpoint, "merges.txt")
other_megatron_args = {"vocab_file": vocab_file, "merge_file": merge_file}
megatron_lm_plugin = MegatronLMPlugin(other_megatron_args=other_megatron_args)

# inference using `megatron_generate` functionality
tokenizer.pad_token = tokenizer.eos_token
max_new_tokens = 64
batch_texts = [
    "Are you human?",
    "The purpose of life is",
    "The arsenal was constructed at the request of",
    "How are you doing these days?",
]
batch_encodings = tokenizer(batch_texts, return_tensors="pt", padding=True)

# top-p sampling
generated_tokens = model.megatron_generate(
    batch_encodings["input_ids"],
    batch_encodings["attention_mask"],
    max_new_tokens=max_new_tokens,
    top_p=0.8,
    top_p_decay=0.5,
    temperature=0.9,
)
decoded_preds = tokenizer.batch_decode(generated_tokens.cpu().numpy())
accelerator.print(decoded_preds)

# top-k sampling
generated_tokens = model.megatron_generate(
    batch_encodings["input_ids"],
    batch_encodings["attention_mask"],
    max_new_tokens=max_new_tokens,
    top_k=50,
    temperature=0.9,
)
decoded_preds = tokenizer.batch_decode(generated_tokens.cpu().numpy())
accelerator.print(decoded_preds)

# adding `bos` token at the start
generated_tokens = model.megatron_generate(
    batch_encodings["input_ids"], batch_encodings["attention_mask"], max_new_tokens=max_new_tokens, add_BOS=True
)
decoded_preds = tokenizer.batch_decode(generated_tokens.cpu().numpy())
accelerator.print(decoded_preds)

# beam search => only takes single prompt
batch_texts = ["The purpose of life is"]
batch_encodings = tokenizer(batch_texts, return_tensors="pt", padding=True)
generated_tokens = model.megatron_generate(
    batch_encodings["input_ids"],
    batch_encodings["attention_mask"],
    max_new_tokens=max_new_tokens,
    num_beams=20,
    length_penalty=1.5,
)
decoded_preds = tokenizer.batch_decode(generated_tokens.cpu().numpy())
accelerator.print(decoded_preds)
```

3. An end-to-end example of using `megatron_generate` method for Megatron-LM GPT model is available at
[megatron_gpt2_generation.py](https://github.com/pacman100/accelerate-megatron-test/blob/main/src/inference/megatron_gpt2_generation.py) with 
config file [megatron_lm_gpt_generate_config.yaml](https://github.com/pacman100/accelerate-megatron-test/blob/main/src/Configs/megatron_lm_gpt_generate_config.yaml).
The bash script with accelerate launch command is available at [megatron_lm_gpt_generate.sh](https://github.com/pacman100/accelerate-megatron-test/blob/main/megatron_lm_gpt_generate.sh).
The output logs of the script are available at [megatron_lm_gpt_generate.log](https://github.com/pacman100/accelerate-megatron-test/blob/main/output_logs/megatron_lm_gpt_generate.log).

## Support for ROPE and ALiBi Positional embeddings and Multi-Query Attention

1. For ROPE/ALiBi attention, pass `position_embedding_type` with `("absolute" | "rotary" | "alibi")` to `MegatronLMPlugin` as shown below.
```python
other_megatron_args = {"position_embedding_type": "alibi"}
megatron_lm_plugin = MegatronLMPlugin(other_megatron_args=other_megatron_args)
```

2. For Multi-Query Attention, pass `attention_head_type` with `("multihead" | "multiquery")` to `MegatronLMPlugin` as shown below.
```python
other_megatron_args = {"attention_head_type": "multiquery"}
megatron_lm_plugin = MegatronLMPlugin(other_megatron_args=other_megatron_args)
```

## Caveats

1. Supports Transformers GPT2, Megatron-BERT and T5 models.
This covers Decoder only, Encoder only and Encoder-Decoder model classes.

2. Only loss is returned from model forward pass as 
there is quite complex interplay of pipeline, tensor and data parallelism behind the scenes.
The `model(**batch_data)` call returns loss(es) averaged across the data parallel ranks.
This is fine for most cases wherein pre-training jobs are run using Megatron-LM features and
you can easily compute the `perplexity` using the loss. 
For GPT model, returning logits in addition to loss(es) is supported. 
These logits aren't gathered across data parallel ranks. Use `accelerate.utils.gather_across_data_parallel_groups`
to gather logits across data parallel ranks. These logits along with labels can be used for computing various 
performance metrics. 

3. The main process is the last rank as the losses/logits are available in the last stage of pipeline.
`accelerator.is_main_process` and `accelerator.is_local_main_process` return `True` for last rank when using 
Megatron-LM integration.

4. In `accelerator.prepare` call, a Megatron-LM model corresponding to a given Transformers model is created
with random weights. Please use `accelerator.load_state` to load the Megatron-LM checkpoint with matching TP, PP and DP partitions.

5. Currently, checkpoint reshaping and interoperability support is only available for GPT. 
Soon it will be extended to BERT and T5.

6. `gradient_accumulation_steps` needs to be 1. When using Megatron-LM, using micro batches in a pipeline parallelism 
setting is synonymous with gradient accumulation. 

7. When using Megatron-LM, use `accelerator.save_state` and `accelerator.load_state` for saving and loading checkpoints.

8. Below is the mapping from Megatron-LM model architectures to the equivalent transformers model architectures.
Only these transformers model architectures are supported.

a. Megatron-LM [BertModel](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/bert_model.py) : 
transformers models with `megatron-bert` in config's model type, e.g., 
[MegatronBERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)
    
b. Megatron-LM [GPTModel](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py) : 
transformers models with `gpt2` in config's model type, e.g., 
[OpenAI GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)
   
c. Megatron-LM [T5Model](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/t5_model.py) : 
transformers models with `t5` in config's model type, e.g., 
[T5](https://huggingface.co/docs/transformers/model_doc/t5) and 
[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)


================================================
FILE: docs/source/usage_guides/model_size_estimator.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Model memory estimator

One very difficult aspect when exploring potential models to use on your machine is knowing just how big of a model will *fit* into memory with your current device (such as loading the model onto CUDA or XPU).

To help alleviate this, Accelerate has a CLI interface through `accelerate estimate-memory`. This tutorial will 
help walk you through using it, what to expect, and at the end link to the interactive demo hosted on the Hub which will 
even let you post those results directly on the model repo!

Currently we support searching for models that can be used in `timm` and `transformers`.

<Tip>

    This API will load the model into memory on the `meta` device, so we are not actually downloading 
    and loading the full weights of the model into memory, nor do we need to. As a result it's 
    perfectly fine to measure 8 billion parameter models (or more), without having to worry about 
    if your CPU can handle it!

</Tip>

## Gradio Demos

Below are a few gradio demos related to what was described above. The first is the official Hugging Face memory estimation space, utilizing Accelerate directly:

<div class="block dark:hidden">
	<iframe 
        src="https://hf-accelerate-model-memory-usage.hf.space?__theme=light"
        width="850"
        height="1600"
    ></iframe>
</div>
<div class="hidden dark:block">
    <iframe 
        src="https://hf-accelerate-model-memory-usage.hf.space?__theme=dark"
        width="850"
        height="1600"
    ></iframe>
</div>

A community member has taken the idea and expanded it further, allowing you to filter models directly and see if you can run a particular LLM given GPU constraints and LoRA configurations. To play with it, see [here](https://huggingface.co/spaces/Vokturz/can-it-run-llm) for more details.

## The Command

When using `accelerate estimate-memory`, you need to pass in the name of the model you want to use, potentially the framework
that the model utilizes (if it can't be found automatically), and the data types you want the model to be loaded in with.

For example, here is how we can calculate the memory footprint for `bert-base-cased`:

```bash
accelerate estimate-memory bert-base-cased
```

This will download the `config.json` for `bert-base-cased`, load the model on the `meta` device, and report back how much space
it will use:

Memory Usage for loading `bert-base-cased`:

| dtype   | Largest Layer | Total Size | Training using Adam |
|---------|---------------|------------|---------------------|
| float32 | 84.95 MB      | 413.18 MB  | 1.61 GB             |
| float16 | 42.47 MB      | 206.59 MB  | 826.36 MB           |
| int8    | 21.24 MB      | 103.29 MB  | 413.18 MB           |
| int4    | 10.62 MB      | 51.65 MB   | 206.59 MB           |

By default it will return all the supported dtypes (`int4` through `float32`), but if you are interested in specific ones these can be filtered.

### Specific libraries

If the source library cannot be determined automatically (like it could in the case of `bert-base-cased`), a library name can
be passed in. 

```bash
accelerate estimate-memory HuggingFaceM4/idefics-80b-instruct --library_name transformers
```

Memory Usage for loading `HuggingFaceM4/idefics-80b-instruct`:

| dtype   | Largest Layer | Total Size | Training using Adam |
|---------|---------------|------------|---------------------|
| float32 | 3.02 GB       | 297.12 GB  | 1.16 TB             |
| float16 | 1.51 GB       | 148.56 GB  | 594.24 GB           |
| int8    | 772.52 MB     | 74.28 GB   | 297.12 GB           |
| int4    | 386.26 MB     | 37.14 GB   | 148.56 GB           |


```bash
accelerate estimate-memory timm/resnet50.a1_in1k --library_name timm
```

Memory Usage for loading `timm/resnet50.a1_in1k`:

| dtype   | Largest Layer | Total Size | Training using Adam |
|---------|---------------|------------|---------------------|
| float32 | 9.0 MB        | 97.7 MB    | 390.78 MB           |
| float16 | 4.5 MB        | 48.85 MB   | 195.39 MB           |
| int8    | 2.25 MB       | 24.42 MB   | 97.7 MB             |
| int4    | 1.12 MB       | 12.21 MB   | 48.85 MB            |

### Specific dtypes

As mentioned earlier, while we return `int4` through `float32` by default, any dtype can be used from `float32`, `float16`, `int8`, and `int4`.

To do so, pass them in after specifying `--dtypes`:

```bash
accelerate estimate-memory bert-base-cased --dtypes float32 float16
```

Memory Usage for loading `bert-base-cased`:

| dtype   | Largest Layer | Total Size | Training using Adam |
|---------|---------------|------------|---------------------|
| float32 | 84.95 MB      | 413.18 MB  | 1.61 GB             |
| float16 | 42.47 MB      | 206.59 MB  | 826.36 MB           |

## Caveats with this calculator

This calculator will tell you how much memory is needed to purely load the model in, *not* to perform inference.

This calculation is accurate within a few % of the actual value, so it is a very good view of just how much memory it will take. For instance loading `bert-base-cased` actually takes `413.68 MB` when loaded on CUDA in full precision, and the calculator estimates `413.18 MB`.

When performing inference you can expect to add up to an additional 20% as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). We'll be conducting research into finding a more accurate estimate to these values, and will update 
this calculator once done.


================================================
FILE: docs/source/usage_guides/mps.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Accelerated PyTorch Training on Mac

With PyTorch v1.12 release, developers and researchers can take advantage of Apple silicon GPUs for significantly faster model training. 
This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on Mac.
Apple's Metal Performance Shaders (MPS) as a backend for PyTorch enables this and can be used via the new `"mps"` device. 
This will map computational graphs and primitives on the MPS Graph framework and tuned kernels provided by MPS.
For more information, please refer to the official documents [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/)
and [MPS BACKEND](https://pytorch.org/docs/stable/notes/mps.html).

### Benefits of Training and Inference using Apple Silicon Chips

1. Enables users to train larger networks or batch sizes locally
2. Reduces data retrieval latency and provides the GPU with direct access to the full memory store due to the unified memory architecture, 
thereby improving end-to-end performance.
3. Reduces costs associated with cloud-based development or the need for additional local GPUs.

**Pre-requisites**: To install torch with mps support, 
please follow this nice medium article [GPU-Acceleration Comes to PyTorch on M1 Macs](https://medium.com/towards-data-science/gpu-acceleration-comes-to-pytorch-on-m1-macs-195c399efcc1).


## How it works out of the box
It is enabled by default on macOS machines with MPS enabled Apple Silicon GPUs.
To disable it, pass `--cpu` flag to `accelerate launch` command or answer the corresponding question when answering the `accelerate config` questionnaire.

You can directly run the following script to test it out on MPS enabled Apple Silicon machines:
```bash
accelerate launch /examples/cv_example.py --data_dir images
```

## A few caveats to be aware of

1. The distributed setups `gloo` and `nccl` do not work with the `mps` device. 
This means that currently only a single GPU of the `mps` device type can be used.

Finally, please remember that `Accelerate` only integrates the MPS backend; therefore, if you
have any problems or questions regarding MPS backend usage, please file an issue with [PyTorch GitHub](https://github.com/pytorch/pytorch/issues).

================================================
FILE: docs/source/usage_guides/profiler.md
================================================
<!--
Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Profiler

Profiler is a tool that allows the collection of performance metrics during training and inference. Profiler’s context manager API can be used to better understand what model operators are the most expensive, examine their input shapes and stack traces, study device kernel activity, and visualize the execution trace. It provides insights into the performance of your model, allowing you to optimize and improve it.

This guide explains how to use PyTorch Profiler to measure the time and memory consumption of the model’s operators and how to integrate this with Accelerate. We will cover various use cases and provide examples for each.

## Using profiler to analyze execution time

Profiler allows one to check which operators were called during the execution of a code range wrapped with a profiler context manager.

Let’s see how we can use profiler to analyze the execution time:

<hfoptions id="cpu execution time">
<hfoption id="PyTorch">

```python
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    model(inputs)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
```

</hfoption>
<hfoption id="Accelerate">

```python
from accelerate import Accelerator, ProfileKwargs
import torch
import torchvision.models as models

model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

profile_kwargs = ProfileKwargs(
    activities=["cpu"],
    record_shapes=True
)

accelerator = Accelerator(cpu=True, kwargs_handlers=[profile_kwargs])
model = accelerator.prepare(model)

with accelerator.profile() as prof:
    with torch.no_grad():
        model(inputs)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
```

</hfoption>
</hfoptions>

The resulting table output (omitting some columns):

```
---------------------------------  ------------  ------------  ------------  ------------  
                             Name      Self CPU     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  
                     aten::conv2d     171.000us      52.260ms       2.613ms            20  
                aten::convolution     227.000us      52.089ms       2.604ms            20  
               aten::_convolution     270.000us      51.862ms       2.593ms            20  
         aten::mkldnn_convolution      51.273ms      51.592ms       2.580ms            20  
                 aten::batch_norm     118.000us       7.059ms     352.950us            20  
     aten::_batch_norm_impl_index     315.000us       6.941ms     347.050us            20  
          aten::native_batch_norm       6.305ms       6.599ms     329.950us            20  
                 aten::max_pool2d      40.000us       4.008ms       4.008ms             1  
    aten::max_pool2d_with_indices       3.968ms       3.968ms       3.968ms             1  
                       aten::add_     780.000us     780.000us      27.857us            28  
---------------------------------  ------------  ------------  ------------  ------------  
Self CPU time total: 67.016ms
```

To get a finer granularity of results and include operator input shapes, pass `group_by_input_shape=True` (note: this requires running the profiler with `record_shapes=True`):

```python
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))
```

## Using profiler to analyze memory consumption

Profiler can also show the amount of memory (used by the model’s tensors) that was allocated (or released) during the execution of the model’s operators. To enable memory profiling functionality pass `profile_memory=True`.

<hfoptions id="memory consumption">
<hfoption id="PyTorch">

```python
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

with profile(activities=[ProfilerActivity.CPU],
        profile_memory=True, record_shapes=True) as prof:
    model(inputs)

print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
```

</hfoption>
<hfoption id="Accelerate">

```python
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

profile_kwargs = ProfileKwargs(
    activities=["cpu"],
    profile_memory=True,
    record_shapes=True
)

accelerator = Accelerator(cpu=True, kwargs_handlers=[profile_kwargs])
model = accelerator.prepare(model)

with accelerator.profile() as prof:
    model(inputs)

print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
```

</hfoption>
</hfoptions>

The resulting table output (omitting some columns):

```
---------------------------------  ------------  ------------  ------------  
                             Name       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  
                      aten::empty      94.85 Mb      94.85 Mb           205  
    aten::max_pool2d_with_indices      11.48 Mb      11.48 Mb             1  
                      aten::addmm      19.53 Kb      19.53 Kb             1  
                       aten::mean      10.00 Kb      10.00 Kb             1  
              aten::empty_strided         492 b         492 b             5  
                        aten::cat         240 b         240 b             6  
                        aten::abs         480 b         240 b             4  
              aten::masked_select         120 b         112 b             1  
                         aten::ne          61 b          53 b             3  
                         aten::eq          30 b          30 b             1  
---------------------------------  ------------  ------------  ------------  
Self CPU time total: 69.332ms
```


## Exporting chrome trace

You can examine the sequence of profiled operators and CUDA kernels in Chrome trace viewer (`chrome://tracing`):

![profile_export](https://github.com/huggingface/accelerate/assets/100389977/5acb193f-6d11-4f7b-9873-c600c19e8172)

<hfoptions id="exporting chrome trace">
<hfoption id="PyTorch">

```python
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    model(inputs)

prof.export_chrome_trace("trace.json")
```

</hfoption>
<hfoption id="Accelerate">

```python
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224).cuda()
profile_kwargs = ProfileKwargs(
    activities=["cpu", "cuda"],
    output_trace_dir="trace"
)

accelerator = Accelerator(kwargs_handlers=[profile_kwargs])
model = accelerator.prepare(model)

with accelerator.profile() as prof:
    model(inputs)

# The trace will be saved to the specified directory
```
For other hardware accelerators, e.g. XPU, you can change `cuda` to `xpu` in the above example code.

</hfoption>
</hfoptions>

## Using Profiler to Analyze Long-Running Jobs

Profiler offers an additional API to handle long-running jobs (such as training loops). Tracing all of the execution can be slow and result in very large trace files. To avoid this, use optional arguments:

- `schedule_option`: Scheduling options allow you to control when profiling is active. This is useful for long-running jobs to avoid collecting too much data. Available keys are `wait`, `warmup`, `active`, `repeat` and `skip_first`. The profiler will skip the first `skip_first` steps, then wait for `wait` steps, then do the warmup for the next `warmup` steps, then do the active recording for the next `active` steps and then repeat the cycle starting with `wait` steps. The optional number of cycles is specified with the `repeat` parameter, the zero value means that the cycles will continue until the profiling is finished.
- `on_trace_ready`: specifies a function that takes a reference to the profiler as an input and is called by the profiler each time the new trace is ready.

To illustrate how the API works, consider the following example:

<hfoptions id="custom handler">
<hfoption id="PyTorch">

```python
from torch.profiler import schedule

my_schedule = schedule(
    skip_first=1,
    wait=5,
    warmup=1,
    active=3,
    repeat=2
)

def trace_handler(p):
    output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10)
    print(output)
    p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=my_schedule,
    on_trace_ready=trace_handler
) as p:
    for idx in range(8):
        model(inputs)
        p.step()
```

</hfoption>
<hfoption id="Accelerate">

```python
def trace_handler(p):
    output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10)
    print(output)
    p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")

profile_kwargs = ProfileKwargs(
    activities=["cpu", "cuda"],
    schedule_option={"wait": 5, "warmup": 1, "active": 3, "repeat": 2, "skip_first": 1},
    on_trace_ready=trace_handler
)

accelerator = Accelerator(kwargs_handlers=[profile_kwargs])
model = accelerator.prepare(model)

with accelerator.profile() as prof:
    for idx in range(8):
        model(inputs)
        prof.step()
```

</hfoption>
</hfoptions>

## FLOPS

The profiler uses a formula to estimate the FLOPs (floating point operations) of specific operators (matrix multiplication and 2D convolution).

To measure floating-point operations (FLOPS):

<hfoptions id="FLOPS">
<hfoption id="PyTorch">

```python
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    with_flops=True
) as prof:
    model(inputs)

print(prof.key_averages().table(sort_by="flops", row_limit=10))
```

</hfoption>
<hfoption id="Accelerate">

```python
profile_kwargs = ProfileKwargs(
    with_flops=True
)
accelerator = Accelerator(kwargs_handlers=[profile_kwargs])

with accelerator.profile() as prof:
    model(inputs)

print(prof.key_averages().table(sort_by="flops", row_limit=10))
```

</hfoption>
</hfoptions>

The resulting table output (omitting some columns):

```
-------------------------------------------------------  ------------  ------------  ------------  
                                                   Name      Self CPU     Self CUDA    Total FLOPs  
-------------------------------------------------------  ------------  ------------  ------------  
                                           aten::conv2d     197.000us       0.000us  18135613440.000  
                                            aten::addmm     103.000us      17.000us     5120000.000  
                                              aten::mul      29.000us       2.000us          30.000  
                                      aten::convolution     409.000us       0.000us            --  
                                     aten::_convolution     253.000us       0.000us            --  
                                aten::cudnn_convolution       5.465ms       2.970ms            --  
                                        cudaEventRecord     138.000us       0.000us            --  
                                  cudaStreamIsCapturing      43.000us       0.000us            --  
                                  cudaStreamGetPriority      40.000us       0.000us            --  
                       cudaDeviceGetStreamPriorityRange      10.000us       0.000us            --  
-------------------------------------------------------  ------------  ------------  ------------  
Self CPU time total: 21.938ms
Self CUDA time total: 4.165ms
```


## Conclusion and Further Information

PyTorch Profiler is a powerful tool for analyzing the performance of your models. By integrating it with Accelerate, you can easily profile your models and gain insights into their performance, helping you to optimize and improve them.

For more detailed information, refer to the [PyTorch Profiler documentation](https://pytorch.org/docs/stable/profiler.html).

================================================
FILE: docs/source/usage_guides/quantization.md
================================================
<!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Model quantization

## `bitsandbytes` Integration

Accelerate brings `bitsandbytes` quantization to your model. You can now load any PyTorch model in 8-bit or 4-bit with a few lines of code.

If you want to use Transformers models with `bitsandbytes`, you should follow this [documentation](https://huggingface.co/docs/transformers/main_classes/quantization). 

To learn more about how the `bitsandbytes` quantization works, check out the blog posts on [8-bit quantization](https://huggingface.co/blog/hf-bitsandbytes-integration) and [4-bit quantization](https://huggingface.co/blog/4bit-transformers-bitsandbytes).

### Pre-Requisites
You will need to install the following requirements:

- Install `bitsandbytes` library
```bash
pip install bitsandbytes
```
For non-cuda devices, you can refer to the bitsandbytes installation guide [here](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend).

- Install latest `accelerate` from source
```bash
pip install git+https://github.com/huggingface/accelerate.git
```
- Install `minGPT` and `huggingface_hub` to run examples
```bash
git clone https://github.com/karpathy/minGPT.git
pip install minGPT/
pip install huggingface_hub
```

### How it works

First, we need to initialize our model. To save memory, we can initialize an empty model using the context manager [`init_empty_weights`]. 

Let's take the GPT2 model from minGPT library.
```py
from accelerate import init_empty_weights
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'gpt2-xl'
model_config.vocab_size = 50257
model_config.block_size = 1024

with init_empty_weights():
    empty_model = GPT(model_config)
```

Then, we need to get the path to the weights of your model. The path can be the state_dict file (e.g. "pytorch_model.bin") or a folder containing the sharded checkpoints. 

```py
from huggingface_hub import snapshot_download
weights_location = snapshot_download(repo_id="marcsun13/gpt2-xl-linear-sharded")
```

Finally, you need to set your quantization configuration with [`~utils.BnbQuantizationConfig`].

Here's an example for 8-bit quantization:
```py
from accelerate.utils import BnbQuantizationConfig
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, llm_int8_threshold = 6)
```

Here's an example for 4-bit quantization:
```py
from accelerate.utils import BnbQuantizationConfig
bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4")
```

To quantize your empty model with the selected configuration, you need to use [`~utils.load_and_quantize_model`]. 

```py
from accelerate.utils import load_and_quantize_model
quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, bnb_quantization_config=bnb_quantization_config)
```

### Saving and loading 8-bit model

You can save your 8-bit model with accelerate using [`~Accelerator.save_model`]. 

```py
from accelerate import Accelerator
accelerate = Accelerator()
new_weights_location = "path/to/save_directory"
accelerate.save_model(quantized_model, new_weights_location)

quantized_model_from_saved = load_and_quantize_model(empty_model, weights_location=new_weights_location, bnb_quantization_config=bnb_quantization_config, device_map = "auto")
```

Note that 4-bit model serialization is currently not supported.

### Offload modules to cpu and disk 

You can offload some modules to cpu/disk if you don't have enough space on your GPUs to store the entire model.
This uses big model inference under the hood. Check this [documentation](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) for more details. 

For 8-bit quantization, the selected modules will be converted to 8-bit precision. 

For 4-bit quantization, the selected modules will be kept in the `torch_dtype` that the user passed in `BnbQuantizationConfig`. We will add support for converting these offloaded modules to 4-bit once 4-bit serialization is possible.

You just need to pass a custom `device_map` in order to offload modules on cpu/disk. The offloaded modules will be dispatched on the GPU when needed. Here's an example:

```py
device_map = {
    "transformer.wte": 0,
    "transformer.wpe": 0,
    "transformer.drop": 0,
    "transformer.h": "cpu",
    "transformer.ln_f": "disk",
    "lm_head": "disk",
}
```
### Fine-tune a quantized model

It is not possible to perform pure 8bit or 4bit training on these models. However, you can train these models by leveraging parameter efficient fine tuning methods (PEFT) and train for example adapters on top of them. Please have a look at [peft](https://github.com/huggingface/peft) library for more details.

Currently, you can't add adapters on top of an arbitrary quantized model. However, thanks to the official support of adapters in Transformers models, you can fine-tune quantized Transformers models. If you want to fine-tune a Transformers model, follow this [documentation](https://huggingface.co/docs/transformers/main_classes/quantization) instead. Check out this [demo](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing) on how to fine-tune a 4-bit Transformers model.

Note that you don’t need to pass `device_map` when loading the model for training. It will automatically load your model on your GPU. Please note that `device_map="auto"` should be used for inference only.

### Example demo - running GPT2 1.5b on a Google Colab

Check out the Google Colab [demo](https://colab.research.google.com/drive/1T1pOgewAWVpR9gKpaEWw4orOrzPFb3yM?usp=sharing) for running a quantized GPT2 model. The GPT2-1.5B model checkpoint is in FP32 which uses 6GB of memory. After quantization, it uses 1.6GB with 8-bit modules and 1.2GB with 4-bit modules.


================================================
FILE: docs/source/usage_guides/sagemaker.md
================================================
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Amazon SageMaker

Hugging Face and Amazon introduced new [Hugging Face Deep Learning Containers (DLCs)](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#huggingface-training-containers) to
make it easier than ever to train Hugging Face Transformer models in [Amazon SageMaker](https://aws.amazon.com/sagemaker/).

## Getting Started

### Setup & Installation


Before you can run your Accelerate scripts on Amazon SageMaker you need to sign up for an AWS account. If you do not
have an AWS account yet learn more [here](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-set-up.html).

After you have your AWS Account you need to install the `sagemaker` sdk for Accelerate with:

```bash
pip install "accelerate[sagemaker]" --upgrade
```

Accelerate currently uses the DLCs, with `transformers`, `datasets` and `tokenizers` pre-installed. Accelerate is not in the DLC yet (will soon be added!) so to use it within Amazon SageMaker you need to create a
`requirements.txt` in the same directory where your training script is located and add it as dependency:

```
accelerate
```

You should also add any other dependencies you have to this `requirements.txt`.


### Configure Accelerate

You can configure the launch configuration for Amazon SageMaker the same as you do for non SageMaker training jobs with
the Accelerate CLI:

```bash
accelerate config
# In which compute environment are you running? ([0] This machine, [1] AWS (Amazon SageMaker)): 1
```

Accelerate will go through a questionnaire about your Amazon SageMaker setup and create a config file you can edit.

<Tip>

    Accelerate is not saving any of your credentials.

</Tip>

### Prepare an Accelerate fine-tuning script

The training script is very similar to a training script you might run outside of SageMaker, but to save your model
after training you need to specify either `/opt/ml/model` or use `os.environ["SM_MODEL_DIR"]` as your save
directory. After training, artifacts in this directory are uploaded to S3:


```diff
- torch.save('/opt/ml/model')
+ accelerator.save('/opt/ml/model')
```

<Tip warning={true}>

    SageMaker doesn’t support argparse actions. If you want to use, for example, boolean hyperparameters, you need to
    specify type as bool in your script and provide an explicit True or False value for this hyperparameter. [[REF]](https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#prepare-a-pytorch-training-script).

</Tip>

### Launch Training

You can launch your training with Accelerate CLI with:

```
accelerate launch path_to_script.py --args_to_the_script
```

This will launch your training script using your configuration. The only thing you have to do is provide all the
arguments needed by your training script as named arguments.

**Examples**

<Tip>

    If you run one of the example scripts, don't forget to add `accelerator.save('/opt/ml/model')` to it.

</Tip>

```bash
accelerate launch ./examples/sagemaker_example.py
```

Outputs:

```
Configuring Amazon SageMaker environment
Converting Arguments to Hyperparameters
Creating Estimator
2021-04-08 11:56:50 Starting - Starting the training job...
2021-04-08 11:57:13 Starting - Launching requested ML instancesProfilerReport-1617883008: InProgress
.........
2021-04-08 11:58:54 Starting - Preparing the instances for training.........
2021-04-08 12:00:24 Downloading - Downloading input data
2021-04-08 12:00:24 Training - Downloading the training image..................
2021-04-08 12:03:39 Training - Training image download completed. Training in progress..
........
epoch 0: {'accuracy': 0.7598039215686274, 'f1': 0.8178438661710037}
epoch 1: {'accuracy': 0.8357843137254902, 'f1': 0.882249560632689}
epoch 2: {'accuracy': 0.8406862745098039, 'f1': 0.8869565217391304}
........
2021-04-08 12:05:40 Uploading - Uploading generated training model
2021-04-08 12:05:40 Completed - Training job completed
Training seconds: 331
Billable seconds: 331
You can find your model data at: s3://your-bucket/accelerate-sagemaker-1-2021-04-08-11-56-47-108/output/model.tar.gz
```

## Advanced Features

### Distributed Training: Data Parallelism

Set up the Accelerate config by running `accelerate config` and answering the SageMaker questions.
To use SageMaker DDP, select it when asked 
`What is the distributed mode? ([0] No distributed training, [1] data parallelism):`.
Example config below:
```yaml
base_job_name: accelerate-sagemaker-1
compute_environment: AMAZON_SAGEMAKER
distributed_type: DATA_PARALLEL
ec2_instance_type: ml.p3.16xlarge
iam_role_name: xxxxx
image_uri: null
mixed_precision: fp16
num_machines: 1
profile: xxxxx
py_version: py10
pytorch_version: 2.5.0
region: us-east-1
transformers_version: 4.17.0
use_cpu: false
```

### Distributed Training: Model Parallelism

*currently in development, will be supported soon.*

### Python packages and dependencies

Accelerate currently uses the DLCs, with `transformers`, `datasets` and `tokenizers` pre-installed. If you
want to use different/other Python packages you can do this by adding them to the `requirements.txt`. These packages
will be installed before your training script is started.

### Local Training: SageMaker Local mode

The local mode in the SageMaker SDK allows you to run your training script locally inside the HuggingFace DLC (Deep Learning container) 
or using your custom container image. This is useful for debugging and testing your training script inside the final container environment.
Local mode uses Docker compose (*Note: Docker Compose V2 is not supported yet*). The SDK will handle the authentication against ECR
to pull the DLC to your local environment. You can emulate CPU (single and multi-instance) and GPU (single instance) SageMaker training jobs.

To use local mode, you need to set your `ec2_instance_type` to `local`.

```yaml
ec2_instance_type: local
```

### Advanced configuration

The configuration allows you to override parameters for the [Estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html).
These settings have to be applied in the config file and are not part of `accelerate config`. You can control many additional aspects of the training job, e.g. use Spot instances, enable network isolation and many more.

```yaml
additional_args:
  # enable network isolation to restrict internet access for containers
  enable_network_isolation: True
```

You can find all available configuration [here](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html).

### Use Spot Instances

You can use Spot Instances e.g. using (see [Advanced configuration](#advanced-configuration)):
```yaml
additional_args:
  use_spot_instances: True
  max_wait: 86400
```

*Note: Spot Instances can be terminated at any time, in which case training has to be resumed from a checkpoint. This is not handled in Accelerate out of the box. Contact us if you would like this feature.*

### Remote scripts: Use scripts located on Github

*undecided if feature is needed. Contact us if you would like this feature.*

================================================
FILE: docs/source/usage_guides/tracking.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Experiment trackers

There are a large number of experiment tracking APIs available, however getting them all to work in a multi-processing environment can oftentimes be complex.
Accelerate provides a general tracking API that can be used to log useful items during your script through [`Accelerator.log`].

## Integrated Trackers

Currently `Accelerate` supports eight trackers out-of-the-box:

- TensorBoard
- WandB 
- Trackio
- CometML
- Aim
- MLFlow
- ClearML
- DVCLive

To use any of them, pass in the selected type(s) to the `log_with` parameter in [`Accelerator`]:
```python
from accelerate import Accelerator
from accelerate.utils import LoggerType

accelerator = Accelerator(log_with="all")  # For all available trackers in the environment
accelerator = Accelerator(log_with="wandb")
accelerator = Accelerator(log_with=["wandb", LoggerType.TENSORBOARD])
```

At the start of your experiment [`Accelerator.init_trackers`] should be used to set up your project, and potentially add any experiment hyperparameters to be logged:
```python
hps = {"num_iterations": 5, "learning_rate": 1e-2}
accelerator.init_trackers("my_project", config=hps)
```

When you are ready to log any data, [`Accelerator.log`] should be used.
A `step` can also be passed in to correlate the data with a particular step in the training loop.
```python
accelerator.log({"train_loss": 1.12, "valid_loss": 0.8}, step=1)
```

Once you've finished training, make sure to run [`Accelerator.end_training`] so that all the trackers can run their finish functionalities if they have any.
```python
accelerator.end_training()
```


A full example is below:
```python
from accelerate import Accelerator

accelerator = Accelerator(log_with="all")
config = {
    "num_iterations": 5,
    "learning_rate": 1e-2,
    "loss_function": str(my_loss_function),
}

accelerator.init_trackers("example_project", config=config)

my_model, my_optimizer, my_training_dataloader = accelerator.prepare(my_model, my_optimizer, my_training_dataloader)
device = accelerator.device
my_model.to(device)

for iteration in range(config["num_iterations"]):
    for step, batch in enumerate(my_training_dataloader):
        my_optimizer.zero_grad()
        inputs, targets = batch
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = my_model(inputs)
        loss = my_loss_function(outputs, targets)
        accelerator.backward(loss)
        my_optimizer.step()
        accelerator.log({"training_loss": loss}, step=step)
accelerator.end_training()
```

If a tracker requires a directory to save data to, such as `TensorBoard`, then pass the directory path to `project_dir`. The `project_dir` parameter is useful 
when there are other configurations to be combined with in the [`~utils.ProjectConfiguration`] data class. For example, you can save the TensorBoard data to `project_dir` and everything else can be logged in the `logging_dir` parameter of [`~utils.ProjectConfiguration`]:

```python
accelerator = Accelerator(log_with="tensorboard", project_dir=".")

# use with ProjectConfiguration
config = ProjectConfiguration(project_dir=".", logging_dir="another/directory")
accelerator = Accelerator(log_with="tensorboard", project_config=config)
```

## Implementing Custom Trackers

To implement a new tracker to be used in `Accelerator`, a new one can be made through implementing the [`GeneralTracker`] class.
Every tracker must implement three functions and have three properties:
  - `__init__`: 
    - Should store a `run_name` and initialize the tracker API of the integrated library. 
    - If a tracker stores their data locally (such as TensorBoard), a `logging_dir` parameter can be added.
  - `store_init_configuration`: 
    - Should take in a `values` dictionary and store them as a one-time experiment configuration
  - `log`: 
    - Should take in a `values` dictionary and a `step`, and should log them to the run

  - `name` (`str`):
    - A unique string name for the tracker, such as `"wandb"` for the wandb tracker. 
    - This will be used for interacting with this tracker specifically
  - `requires_logging_directory` (`bool`):
    - Whether a `logging_dir` is needed for this particular tracker and if it uses one.
  - `tracker`: 
    - This should be implemented as a `@property` function 
    - Should return the internal tracking mechanism the library uses, such as the `run` object for `wandb`.

Each method should also utilize the [`state.PartialState`] class if the logger should only be executed on the main process for instance.

A brief example can be seen below with an integration with Weights and Biases, containing only the relevant information and logging just on 
the main process:
```python
from accelerate.tracking import GeneralTracker, on_main_process
from typing import Optional

import wandb


class MyCustomTracker(GeneralTracker):
    name = "wandb"
    requires_logging_directory = False

    @on_main_process
    def __init__(self, run_name: str):
        self.run_name = run_name
        self.run = wandb.init(self.run_name)

    @property
    def tracker(self):
        return self.run.run

    @on_main_process
    def store_init_configuration(self, values: dict):
        wandb.config(values)

    @on_main_process
    def log(self, values: dict, step: Optional[int] = None):
        wandb.log(values, step=step)
```

When you are ready to build your `Accelerator` object, pass in an **instance** of your tracker to the `log_with` parameter of [`Accelerator`] to have it automatically
be used with the API:

```python
tracker = MyCustomTracker("some_run_name")
accelerator = Accelerator(log_with=tracker)
```

These also can be mixed with existing trackers, including with `"all"`:

```python
tracker = MyCustomTracker("some_run_name")
accelerator = Accelerator(log_with=[tracker, "all"])
```

## Accessing the internal tracker 

If you need to interact with a tracker directly, you can quickly access it using the 
[`Accelerator.get_tracker`] method. Just pass in the string corresponding to a tracker's `.name` attribute 
and it will return that tracker on the main process.

This example shows doing so with wandb:

```python
wandb_tracker = accelerator.get_tracker("wandb")
```

From there you can interact with `wandb`'s `run` object like normal:

```python
wandb_tracker.log_artifact(some_artifact_to_log)
```

<Tip>
  Trackers built in Accelerate will automatically execute on the correct process, 
  so if a tracker is only meant to be run on the main process it will do so 
  automatically.
</Tip>

If you want to truly remove Accelerate's wrapping entirely, you can 
achieve the same outcome with:

```python
wandb_tracker = accelerator.get_tracker("wandb", unwrap=True)
if accelerator.is_main_process:
    wandb_tracker.log_artifact(some_artifact_to_log)
```


## When a wrapper cannot work

If a library has an API that does not follow a strict `.log` call taking a single dictionary of values, such as Neptune.AI, logging can be done manually under an `if accelerator.is_main_process` statement:
```diff
  from accelerate import Accelerator
+ import neptune

  accelerator = Accelerator()
+ run = neptune.init_run(...)

  my_model, my_optimizer, my_training_dataloader = accelerator.prepare(my_model, my_optimizer, my_training_dataloader)
  device = accelerator.device
  my_model.to(device)

  for iteration in range(config["num_iterations"]):
      for batch in my_training_dataloader:
          my_optimizer.zero_grad()
          inputs, targets = batch
          inputs = inputs.to(device)
          targets = targets.to(device)
          outputs = my_model(inputs)
          loss = my_loss_function(outputs, targets)
          total_loss += loss
          accelerator.backward(loss)
          my_optimizer.step()
+         if accelerator.is_main_process:
+             run["logs/training/batch/loss"].log(loss)
```


================================================
FILE: docs/source/usage_guides/training_zoo.md
================================================
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Example Zoo

Below is a non-exhaustive list of tutorials and scripts showcasing Accelerate.

## Official Accelerate Examples:

### Basic Examples

These examples showcase the base features of Accelerate and are a great starting point

- [Barebones NLP example](https://github.com/huggingface/accelerate/blob/main/examples/nlp_example.py)
- [Barebones distributed NLP example in a Jupyter Notebook](https://github.com/huggingface/notebooks/blob/main/examples/accelerate_examples/simple_nlp_example.ipynb)
- [Barebones computer vision example](https://github.com/huggingface/accelerate/blob/main/examples/cv_example.py)
- [Barebones distributed computer vision example in a Jupyter Notebook](https://github.com/huggingface/notebooks/blob/main/examples/accelerate_examples/simple_cv_example.ipynb)
- [Using Accelerate in Kaggle](https://www.kaggle.com/code/muellerzr/multi-gpu-and-accelerate)

### Feature Specific Examples

These examples showcase specific features that the Accelerate framework offers

- [Automatic memory-aware gradient accumulation](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/automatic_gradient_accumulation.py)
- [Checkpointing states](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/checkpointing.py)
- [Cross validation](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/cross_validation.py)
- [DeepSpeed](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/deepspeed_with_config_support.py)
- [Fully Sharded Data Parallelism](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/fsdp_with_peak_mem_tracking.py)
- [Gradient accumulation](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/gradient_accumulation.py)
- [Memory-aware batch size finder](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/memory.py)
- [Metric Computation](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/multi_process_metrics.py)
- [Using Trackers](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/tracking.py)
- [Using Megatron-LM](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/megatron_lm_gpt_pretraining.py)

### Full Examples 

These examples showcase every feature in Accelerate at once that was shown in "Feature Specific Examples"

- [Complete NLP example](https://github.com/huggingface/accelerate/blob/main/examples/complete_nlp_example.py)
- [Complete computer vision example](https://github.com/huggingface/accelerate/blob/main/examples/complete_cv_example.py)
- [Very complete and extensible vision example showcasing SLURM, hydra, and a very extensible usage of the framework](https://github.com/yuvalkirstain/PickScore)
- [Causal language model fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm_no_trainer.py)
- [Masked language model fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_no_trainer.py)
- [Speech pretraining example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py)
- [Translation fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/translation/run_translation_no_trainer.py)
- [Text classification fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue_no_trainer.py)
- [Semantic segmentation fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py)
- [Question answering fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa_no_trainer.py)
- [Beam search question answering fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py)
- [Multiple choice question answering fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/multiple-choice/run_swag_no_trainer.py)
- [Named entity recognition fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/token-classification/run_ner_no_trainer.py)
- [Image classification fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/image-classification/run_image_classification_no_trainer.py)
- [Summarization fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization_no_trainer.py)
- [End-to-end examples on how to use AWS SageMaker integration of Accelerate](https://github.com/huggingface/notebooks/blob/main/sagemaker/22_accelerate_sagemaker_examples/README.md)
- [Megatron-LM examples for various NLP tasks](https://github.com/pacman100/accelerate-megatron-test) 

## Integration Examples 

These are tutorials from libraries that integrate with Accelerate: 

> Don't find your integration here? Make a PR to include it!

### Amphion
- [Training Text-to-Speech Models with Amphion](https://github.com/open-mmlab/Amphion/blob/main/egs/tts/README.md)
- [Training Singing Voice Conversion Models with Amphion](https://github.com/open-mmlab/Amphion/blob/main/egs/svc/README.md)
- [Training Vocoders with Amphion](https://github.com/open-mmlab/Amphion/blob/main/egs/vocoder/README.md)

### Catalyst

- [Distributed training tutorial with Catalyst](https://catalyst-team.github.io/catalyst/tutorials/ddp.html)

### DALLE2-pytorch 

- [Fine-tuning DALLE2](https://github.com/lucidrains/DALLE2-pytorch#usage)

### Diffusers

- [Performing textual inversion with diffusers](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion)
- [Training DreamBooth with diffusers](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth)

### fastai 

- [Distributed training from Jupyter Notebooks with fastai](https://docs.fast.ai/tutorial.distributed.html)
- [Basic distributed training examples with fastai](https://docs.fast.ai/examples/distributed_app_examples.html)

### GradsFlow

- [Auto Image Classification with GradsFlow](https://docs.gradsflow.com/en/latest/examples/nbs/01-ImageClassification/)

### imagen-pytorch 

- [Fine-tuning Imagen](https://github.com/lucidrains/imagen-pytorch#usage)

### Kornia

- [Fine-tuning vision models with Kornia's Trainer](https://kornia.readthedocs.io/en/latest/get-started/training.html)

### PyTorch Accelerated 

- [Quickstart distributed training tutorial with PyTorch Accelerated](https://pytorch-accelerated.readthedocs.io/en/latest/quickstart.html)

### PyTorch3D

- [Perform Deep Learning with 3D data](https://pytorch3d.org/tutorials/)

### Stable-Dreamfusion

- [Training with Stable-Dreamfusion to convert text to a 3D model](https://colab.research.google.com/drive/1MXT3yfOFvO0ooKEfiUUvTKwUkrrlCHpF?usp=sharing)

### Tez 

- [Leaf disease detection with Tez and Accelerate](https://www.kaggle.com/code/abhishek/tez-faster-and-easier-training-for-leaf-detection/notebook)

### trlx 

- [How to implement a sentiment learning task with trlx](https://github.com/CarperAI/trlx#example-how-to-add-a-task)

### Comfy-UI

- [Enabling using large Stable Diffusion Models in low-vram settings using Accelerate](https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/model_management.py#L291-L296)


## In Science

Below is a non-exhaustive list of papers utilizing Accelerate. 

> Don't find your paper here? Make a PR to include it!

* Yuval Kirstain, Adam Polyak, Uriel Singer, Shahbuland Matiana, Joe Penna, Omer Levy: “Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation”, 2023; [arXiv:2305.01569](http://huggingface.co/papers/2305.01569).
* Lei Wang, Wanyu Xu, Yihuai Lan, Zhiqiang Hu, Yunshi Lan, Roy Ka-Wei Lee, Ee-Peng Lim: “Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models”, 2023; [arXiv:2305.04091](http://huggingface.co/papers/2305.04091).
* Arthur Câmara, Claudia Hauff: “Moving Stuff Around: A study on efficiency of moving documents into memory for Neural IR models”, 2022; [arXiv:2205.08343](http://huggingface.co/papers/2205.08343).
* Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Daniel Y. Fu, Zhiqiang Xie, Beidi Chen, Clark Barrett, Joseph E. Gonzalez, Percy Liang, Christopher Ré, Ion Stoica, Ce Zhang: “High-throughput Generative Inference of Large Language Models with a Single GPU”, 2023; [arXiv:2303.06865](http://huggingface.co/papers/2303.06865).
* Peter Melchior, Yan Liang, ChangHoon Hahn, Andy Goulding: “Autoencoding Galaxy Spectra I: Architecture”, 2022; [arXiv:2211.07890](http://huggingface.co/papers/2211.07890).
* Jiaao Chen, Aston Zhang, Mu Li, Alex Smola, Diyi Yang: “A Cheaper and Better Diffusion Language Model with Soft-Masked Noise”, 2023; [arXiv:2304.04746](http://huggingface.co/papers/2304.04746).
* Ayaan Haque, Matthew Tancik, Alexei A. Efros, Aleksander Holynski, Angjoo Kanazawa: “Instruct-NeRF2NeRF: Editing 3D Scenes with Instructions”, 2023; [arXiv:2303.12789](http://huggingface.co/papers/2303.12789).
* Luke Melas-Kyriazi, Christian Rupprecht, Iro Laina, Andrea Vedaldi: “RealFusion: 360° Reconstruction of Any Object from a Single Image”, 2023; [arXiv:2302.10663](http://huggingface.co/papers/2302.10663).
* Xiaoshi Wu, Keqiang Sun, Feng Zhu, Rui Zhao, Hongsheng Li: “Better Aligning Text-to-Image Models with Human Preference”, 2023; [arXiv:2303.14420](http://huggingface.co/papers/2303.14420).
* Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li, Weiming Lu, Yueting Zhuang: “HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace”, 2023; [arXiv:2303.17580](http://huggingface.co/papers/2303.17580).
* Yue Yang, Wenlin Yao, Hongming Zhang, Xiaoyang Wang, Dong Yu, Jianshu Chen: “Z-LaVI: Zero-Shot Language Solver Fueled by Visual Imagination”, 2022; [arXiv:2210.12261](http://huggingface.co/papers/2210.12261).
* Sheng-Yen Chou, Pin-Yu Chen, Tsung-Yi Ho: “How to Backdoor Diffusion Models?”, 2022; [arXiv:2212.05400](http://huggingface.co/papers/2212.05400).
* Junyoung Seo, Wooseok Jang, Min-Seop Kwak, Jaehoon Ko, Hyeonsu Kim, Junho Kim, Jin-Hwa Kim, Jiyoung Lee, Seungryong Kim: “Let 2D Diffusion Model Know 3D-Consistency for Robust Text-to-3D Generation”, 2023; [arXiv:2303.07937](http://huggingface.co/papers/2303.07937).
* Or Patashnik, Daniel Garibi, Idan Azuri, Hadar Averbuch-Elor, Daniel Cohen-Or: “Localizing Object-level Shape Variations with Text-to-Image Diffusion Models”, 2023; [arXiv:2303.11306](http://huggingface.co/papers/2303.11306).
* Dídac Surís, Sachit Menon, Carl Vondrick: “ViperGPT: Visual Inference via Python Execution for Reasoning”, 2023; [arXiv:2303.08128](http://huggingface.co/papers/2303.08128).
* Chenyang Qi, Xiaodong Cun, Yong Zhang, Chenyang Lei, Xintao Wang, Ying Shan, Qifeng Chen: “FateZero: Fusing Attentions for Zero-shot Text-based Video Editing”, 2023; [arXiv:2303.09535](http://huggingface.co/papers/2303.09535).
* Sean Welleck, Jiacheng Liu, Ximing Lu, Hannaneh Hajishirzi, Yejin Choi: “NaturalProver: Grounded Mathematical Proof Generation with Language Models”, 2022; [arXiv:2205.12910](http://huggingface.co/papers/2205.12910).
* Elad Richardson, Gal Metzer, Yuval Alaluf, Raja Giryes, Daniel Cohen-Or: “TEXTure: Text-Guided Texturing of 3D Shapes”, 2023; [arXiv:2302.01721](http://huggingface.co/papers/2302.01721).
* Puijin Cheng, Li Lin, Yijin Huang, Huaqing He, Wenhan Luo, Xiaoying Tang: “Learning Enhancement From Degradation: A Diffusion Model For Fundus Image Enhancement”, 2023; [arXiv:2303.04603](http://huggingface.co/papers/2303.04603).
* Shun Shao, Yftah Ziser, Shay Cohen: “Erasure of Unaligned Attributes from Neural Representations”, 2023; [arXiv:2302.02997](http://huggingface.co/papers/2302.02997).
* Seonghyeon Ye, Hyeonbin Hwang, Sohee Yang, Hyeongu Yun, Yireun Kim, Minjoon Seo: “In-Context Instruction Learning”, 2023; [arXiv:2302.14691](http://huggingface.co/papers/2302.14691).
* Shikun Liu, Linxi Fan, Edward Johns, Zhiding Yu, Chaowei Xiao, Anima Anandkumar: “Prismer: A Vision-Language Model with An Ensemble of Experts”, 2023; [arXiv:2303.02506](http://huggingface.co/papers/2303.02506).
* Haoyu Chen, Zhihua Wang, Yang Yang, Qilin Sun, Kede Ma: “Learning a Deep Color Difference Metric for Photographic Images”, 2023; [arXiv:2303.14964](http://huggingface.co/papers/2303.14964).
* Van-Hoang Le, Hongyu Zhang: “Log Parsing with Prompt-based Few-shot Learning”, 2023; [arXiv:2302.07435](http://huggingface.co/papers/2302.07435).
* Keito Kudo, Yoichi Aoki, Tatsuki Kuribayashi, Ana Brassard, Masashi Yoshikawa, Keisuke Sakaguchi, Kentaro Inui: “Do Deep Neural Networks Capture Compositionality in Arithmetic Reasoning?”, 2023; [arXiv:2302.07866](http://huggingface.co/papers/2302.07866).
* Ruoyao Wang, Peter Jansen, Marc-Alexandre Côté, Prithviraj Ammanabrolu: “Behavior Cloned Transformers are Neurosymbolic Reasoners”, 2022; [arXiv:2210.07382](http://huggingface.co/papers/2210.07382).
* Martin Wessel, Tomáš Horych, Terry Ruas, Akiko Aizawa, Bela Gipp, Timo Spinde: “Introducing MBIB -- the first Media Bias Identification Benchmark Task and Dataset Collection”, 2023; [arXiv:2304.13148](http://huggingface.co/papers/2304.13148). DOI: [10.1145/3539618.3591882](https://dx.doi.org/10.1145/3539618.3591882).
* Hila Chefer, Yuval Alaluf, Yael Vinker, Lior Wolf, Daniel Cohen-Or: “Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models”, 2023; [arXiv:2301.13826](http://huggingface.co/papers/2301.13826).
* Marcio Fonseca, Yftah Ziser, Shay B. Cohen: “Factorizing Content and Budget Decisions in Abstractive Summarization of Long Documents”, 2022; [arXiv:2205.12486](http://huggingface.co/papers/2205.12486).
* Elad Richardson, Gal Metzer, Yuval Alaluf, Raja Giryes, Daniel Cohen-Or: “TEXTure: Text-Guided Texturing of 3D Shapes”, 2023; [arXiv:2302.01721](http://huggingface.co/papers/2302.01721).
* Tianxing He, Jingyu Zhang, Tianle Wang, Sachin Kumar, Kyunghyun Cho, James Glass, Yulia Tsvetkov: “On the Blind Spots of Model-Based Evaluation Metrics for Text Generation”, 2022; [arXiv:2212.10020](http://huggingface.co/papers/2212.10020).
* Ori Ram, Yoav Levine, Itay Dalmedigos, Dor Muhlgay, Amnon Shashua, Kevin Leyton-Brown, Yoav Shoham: “In-Context Retrieval-Augmented Language Models”, 2023; [arXiv:2302.00083](http://huggingface.co/papers/2302.00083).
* Dacheng Li, Rulin Shao, Hongyi Wang, Han Guo, Eric P. Xing, Hao Zhang: “MPCFormer: fast, performant and private Transformer inference with MPC”, 2022; [arXiv:2211.01452](http://huggingface.co/papers/2211.01452).
* Baolin Peng, Michel Galley, Pengcheng He, Chris Brockett, Lars Liden, Elnaz Nouri, Zhou Yu, Bill Dolan, Jianfeng Gao: “GODEL: Large-Scale Pre-Training for Goal-Directed Dialog”, 2022; [arXiv:2206.11309](http://huggingface.co/papers/2206.11309).
* Egil Rønningstad, Erik Velldal, Lilja Øvrelid: “Entity-Level Sentiment Analysis (ELSA): An exploratory task survey”, 2023, Proceedings of the 29th International Conference on Computational Linguistics, 2022, pages 6773-6783; [arXiv:2304.14241](http://huggingface.co/papers/2304.14241).
* Charlie Snell, Ilya Kostrikov, Yi Su, Mengjiao Yang, Sergey Levine: “Offline RL for Natural Language Generation with Implicit Language Q Learning”, 2022; [arXiv:2206.11871](http://huggingface.co/papers/2206.11871).
* Zhiruo Wang, Shuyan Zhou, Daniel Fried, Graham Neubig: “Execution-Based Evaluation for Open-Domain Code Generation”, 2022; [arXiv:2212.10481](http://huggingface.co/papers/2212.10481).
* Minh-Long Luu, Zeyi Huang, Eric P. Xing, Yong Jae Lee, Haohan Wang: “Expeditious Saliency-guided Mix-up through Random Gradient Thresholding”, 2022; [arXiv:2212.04875](http://huggingface.co/papers/2212.04875).
* Jun Hao Liew, Hanshu Yan, Daquan Zhou, Jiashi Feng: “MagicMix: Semantic Mixing with Diffusion Models”, 2022; [arXiv:2210.16056](http://huggingface.co/papers/2210.16056).
* Yaqing Wang, Subhabrata Mukherjee, Xiaodong Liu, Jing Gao, Ahmed Hassan Awadallah, Jianfeng Gao: “LiST: Lite Prompted Self-training Makes Parameter-Efficient Few-shot Learners”, 2021; [arXiv:2110.06274](http://huggingface.co/papers/2110.06274).


================================================
FILE: examples/README.md
================================================
<!---
Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# In this folder we showcase various full examples using 🤗 Accelerate

## Simple NLP example

The [nlp_example.py](./nlp_example.py) script is a simple example to train a Bert model on a classification task ([GLUE's MRPC](https://www.microsoft.com/en-us/download/details.aspx?id=52398)).

Prior to running it you should install 🤗 Datasets, 🤗 Evaluate and 🤗 Transformers:

```bash
pip install datasets evaluate transformers
```

The same script can be run in any of the following configurations:
- single CPU or single GPU
- multi CPUs
- multi GPUs (using PyTorch distributed mode)
- (multi) TPUs
- fp16 (mixed-precision) or fp32 (normal precision)

To run it in each of these various modes, use the following commands:
- single CPU:
    * from a server without GPU
        ```bash
        python ./nlp_example.py
        ```
    * from any server by passing `cpu=True` to the `Accelerator`.
        ```bash
        python ./nlp_example.py --cpu
        ```
    * from any server with Accelerate launcher
        ```bash
        accelerate launch --cpu ./nlp_example.py
        ```
- single GPU:
    ```bash
    python ./nlp_example.py  # from a server with a GPU
    ```
- with fp16 (mixed-precision)
    * from any server by passing `mixed_precision=fp16` to the `Accelerator`.
        ```bash
        python ./nlp_example.py --mixed_precision fp16
        ```
    * from any server with Accelerate launcher
        ```bash
        accelerate launch --mixed_precision fp16 ./nlp_example.py
        ```
- multi CPUs (requires Open MPI, Intel MPI, or MVAPICH)
    * With Accelerate config and launcher, execute the following from node 0:
        ```bash
        accelerate config  # Select to have accelerate launch mpirun
        accelerate launch ./nlp_example.py  # This will run the script on each server
        ```
    * With Intel MPI:
        ```bash
        export CCL_WORKER_COUNT=1
        export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip
        mpirun -f hostfile -n 16 -ppn 4 python ./nlp_example.py
        ```
- multi GPUs (using PyTorch distributed mode)
    * With Accelerate config and launcher
        ```bash
        accelerate config  # This will create a config file on your server
        accelerate launch ./nlp_example.py  # This will run the script on your server
        ```
    * With traditional PyTorch launcher (`python -m torch.distributed.run` can be used instead of `torchrun`)
        ```bash
        torchrun --nproc_per_node 2 ./nlp_example.py
        ```
- multi GPUs, multi node (several machines, using PyTorch distributed mode)
    * With Accelerate config and launcher, on each machine:
        ```bash
        accelerate config  # This will create a config file on each server
        accelerate launch ./nlp_example.py  # This will run the script on each server
        ```
    * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node:
        ```bash
        # `torchrun` below can be replaced with `python -m torch.distributed.run`.
        # `--rdzv_id` must be a unique job id.
        torchrun \
            --nproc_per_node 2 \
            --nnodes 2 \
            --rdzv_id 2299 \
            --rdzv_backend c10d \
            --rdzv_endpoint master_node_ip_address:29500 \
            ./nlp_example.py
        ```
- (multi) TPUs
    * With Accelerate config and launcher
        ```bash
        accelerate config  # This will create a config file on your TPU server
        accelerate launch ./nlp_example.py  # This will run the script on each server
        ```
    * In PyTorch:
        Add an `xmp.spawn` line in your script as you usually do.


## Simple vision example

The [cv_example.py](./cv_example.py) script is a simple example to fine-tune a ResNet-50 on a classification task ([Oxford-IIIT Pet Dataset](https://www.robots.ox.ac.uk/~vgg/data/pets/)).

The same script can be run in any of the following configurations:
- single CPU or single GPU
- multi CPUs
- multi GPUs (using PyTorch distributed mode)
- (multi) TPUs
- fp16 (mixed-precision) or fp32 (normal precision)

Prior to running it you should install timm and torchvision:

```bash
pip install timm torchvision
```

and you should download the data with the following commands:

```bash
wget https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz
tar -xzf images.tar.gz
```

To run it in each of these various modes, use the following commands:
- single CPU:
    * from a server without GPU
        ```bash
        python ./cv_example.py --data_dir path_to_data
        ```
    * from any server by passing `cpu=True` to the `Accelerator`.
        ```bash
        python ./cv_example.py --data_dir path_to_data --cpu
        ```
    * from any server with Accelerate launcher
        ```bash
        accelerate launch --cpu ./cv_example.py --data_dir path_to_data
        ```
- single GPU:
    ```bash
    python ./cv_example.py --data_dir path_to_data  # from a server with a GPU
    ```
- with fp16 (mixed-precision)
    * from any server by passing `mixed_precision=fp16` to the `Accelerator`.
        ```bash
        python ./cv_example.py --data_dir path_to_data --mixed_precision fp16
        ```
    * from any server with Accelerate launcher
        ```bash
        accelerate launch --mixed_precision fp16 ./cv_example.py --data_dir path_to_data
        ```
- multi CPUs (requires Open MPI, Intel MPI, or MVAPICH)
    * With Accelerate config and launcher, run the following from node 0:
        ```bash
        accelerate config --config_file config.yaml  # Select to have accelerate launch mpirun
        accelerate launch --config_file config.yaml ./cv_example.py --data_dir path_to_data  # This will run the script on each server
        ```
    * With Intel MPI, execute mpirun from node 0:
        ```bash
        export CCL_WORKER_COUNT=1
        export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip
        mpirun -f hostfile -n 16 -ppn 4 python ./cv_example.py --data_dir path_to_data
        ```
- multi GPUs (using PyTorch distributed mode)
    * With Accelerate config and launcher
        ```bash
        accelerate config --config_file config.yaml  # This will create a config file on your server to `config.yaml`
        accelerate launch --config_file config.yaml ./cv_example.py --data_dir path_to_data  # This will run the script on your server
        ```
    * With traditional PyTorch launcher (`python -m torch.distributed.run` can be used instead of `torchrun`)
        ```bash
        torchrun --nproc_per_node 2 ./cv_example.py --data_dir path_to_data
        ```
- multi GPUs, multi node (several machines, using PyTorch distributed mode)
    * With Accelerate config and launcher, on each machine:
        ```bash
        accelerate config --config_file config.yaml  # This will create a config file on your server to `config.yaml`
        accelerate launch --config_file config.yaml ./cv_example.py --data_dir path_to_data  # This will run the script on each server
        ```
    * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node:
        ```bash
        # `torchrun` below can be replaced with `python -m torch.distributed.run`.
        # `--rdzv_id` must be a unique job id.
        torchrun \
            --nproc_per_node 2 \
            --nnodes 2 \
            --rdzv_id 2299 \
            --rdzv_backend c10d \
            --rdzv_endpoint master_node_ip_address:29500 \
            ./cv_example.py --data_dir path_to_data
        ```
- (multi) TPUs
    * With Accelerate config and launcher
        ```bash
        accelerate config --config_file config.yaml  # This will create a config file on your server to `config.yaml`
        accelerate launch --config_file config.yaml ./cv_example.py --data_dir path_to_data  # This will run the script on each server
        ```
    * In PyTorch:
        Add an `xmp.spawn` line in your script as you usually do.

### Simple vision example (GANs)

- [huggan project](https://github.com/huggingface/community-events/tree/main/huggan)


### Using AWS SageMaker integration
- [Examples showcasing AWS SageMaker integration of 🤗 Accelerate.](https://github.com/pacman100/accelerate-aws-sagemaker)

## Configuration zoo
In [/config_yaml_templates](./config_yaml_templates/) we have a variety of *minimal* `config.yaml` templates and examples to help you learn
how to create your own configuration files depending on the scenario. 

## SLURM Scripts 
In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) and [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we present two scripts for running the examples on a machine with [SLURM](https://slurm.schedmd.com/documentation.html) workload manager. 

In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. 

In [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many GPUs we will use in total (`--num_processes`), the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend), `--main_process_ip` which will be the address of the master node and the `--main_process_port`.

In [/slurm/submit_multicpu.sh](./slurm/submit_multicpu.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many CPU processes we will use in total (`--num_processes`), the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend), `--main_process_ip` which will be the address of the master node and the `--main_process_port`. `mpirun_hostfile` specifies to run the job using MPIRun.

In both scripts, we run `activateEnvironment.sh` at the beginning. This script should contain the necessary instructions to initialize the environment for execution. Below, we show an example that loads the necessary libraries ([Environment modules](https://github.com/cea-hpc/modules)), activates the Python environment, and sets up various environment variables, most of them to run the scripts in offline mode in case we don't have internet connection from the cluster.

```bash
# activateEnvironment.sh 
module purge
module load anaconda3/2020.02 cuda/10.2 cudnn/8.0.5 nccl/2.9.9 arrow/7.0.0 openmpi
source activate /home/nct01/nct01328/pytorch_antoni_local

export HF_HOME=/gpfs/projects/nct01/nct01328/
export HF_LOCAL_HOME=/gpfs/projects/nct01/nct01328/HF_LOCAL
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
export PYTHONPATH=/home/nct01/nct01328/transformers-in-supercomputers:$PYTHONPATH 
export GPUS_PER_NODE=4
```

## Simple Multi-GPU Hardware Launcher (using an external platform)

[multigpu_remote_launcher.py](./multigpu_remote_launcher.py) is a minimal script that demonstrates launching accelerate
on multiple remote GPUs, and with automatic hardware environment and dependency setup for reproducibility. You can
easily customize the training function used, training arguments, hyperparameters, and type of compute hardware, and then
run the script to automatically launch multi GPU training on remote hardware.

This script uses [Runhouse](https://github.com/run-house/runhouse) to launch on self-hosted hardware (e.g. in your own
cloud account or on-premise cluster) but there are other options for running remotely as well. Runhouse can be installed
with `pip install runhouse`, and you can refer to
[hardware setup](https://runhouse-docs.readthedocs-hosted.com/en/latest/api/python/cluster.html#hardware-setup)
for hardware setup instructions, or this
[Colab tutorial](https://colab.research.google.com/drive/1qVwYyLTCPYPSdz9ZX7BZl9Qm0A3j7RJe) for a more in-depth walkthrough.

## Simple fine-tuning script that works on TPU

[finetune_lm_tpu.py](./finetune_lm_tpu.py) is a classical language modeling generation fine tuning script that has been
adapted to run best on TPUs. It has been successfully run and tested on a TPU v5 litepod-8, and it shows how it is
possible to perform a fine-tuning task on such hardware thanks to accelerate and FSDPv2, using transformers and Torch XLA.

## Finer Examples

While the first two scripts are extremely barebones when it comes to what you can do with accelerate, more advanced features are documented in two other locations.

### `by_feature` examples

These scripts are *individual* examples highlighting one particular feature or use-case within Accelerate. They all stem from the [nlp_example.py](./nlp_example.py) script, and any changes or modifications are denoted with a `# New Code #` comment.

Read the README.md file located in the `by_feature` folder for more information.

### `complete_*` examples

These two scripts contain *every* single feature currently available in Accelerate in one place, as one giant script.

New arguments that can be passed include:

- `checkpointing_steps`, whether the various states should be saved at the end of every `n` steps, or `"epoch"` for each epoch. States are then saved to folders named `step_{n}` or `epoch_{n}`
- `resume_from_checkpoint`, should be used if you want to resume training off of a previous call to the script and passed a `checkpointing_steps` to it.
- `with_tracking`, should be used if you want to log the training run using all available experiment trackers in your environment. Currently supported trackers include TensorBoard, Weights and Biases, and CometML.


================================================
FILE: examples/alst_ulysses_sequence_parallelism/README.md
================================================
# Deepspeed's ALST/Ulysses sequence parallelism

This is an example of the use of Ulysses Sequence Parallelism, which uses attention head parallelism and is part of the Arctic Long Sequence Training project at [ArcticTraining](https://github.com/snowflakedb/ArcticTraining). [This paper](https://arxiv.org/abs/2506.13996) goes into the details of this protocol.

For nuances of usage please refer to the main HF Accelerate tutorial on [Context Parallelism](https://huggingface.co/docs/accelerate/en/concept_guides/context_parallelism).

You need to use at least `2` gpus to enable ALST/Ulysses sequence parallelism.

To run the example with `4` gpus:

```bash
bash ./sp-alst.sh
```

Change `4` to the desired sequence parallelism degree in these 2 files:
```
sp-alst.accelerate-config.yml:num_processes: 4
sp-alst.py:    sp_size=4,
```


================================================
FILE: examples/alst_ulysses_sequence_parallelism/sp-alst.accelerate-config.yml
================================================
# Accelerate launch configuration for the ALST/Ulysses sequence parallelism example
# (single local machine, DeepSpeed as the distributed backend).
compute_environment: LOCAL_MACHINE
deepspeed_config:
  # DeepSpeed settings are read from the JSON file that sits next to this config
  deepspeed_config_file: sp-alst.ds-config.json
  zero3_init_flag: false
distributed_type: DEEPSPEED
machine_rank: 0
main_training_function: main
num_machines: 1
# Keep in sync with `sp_size` in sp-alst.py when changing the sequence parallelism degree
num_processes: 4
rdzv_backend: static
same_network: true
use_cpu: false

================================================
FILE: examples/alst_ulysses_sequence_parallelism/sp-alst.ds-config.json
================================================
{
    "bf16": {
        "enabled": true
    },
    "zero_optimization": {
        "stage": 3
    },
    "gradient_accumulation_steps": 1,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "seq_parallel_communication_data_type": "bf16"
}

================================================
FILE: examples/alst_ulysses_sequence_parallelism/sp-alst.py
================================================
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from deepspeed.runtime.utils import move_to_device
from transformers import AutoModelForCausalLM, AutoTokenizer

from accelerate import Accelerator
from accelerate.utils import ParallelismConfig, set_seed
from accelerate.utils.dataclasses import DeepSpeedSequenceParallelConfig


# Seed the run before anything else is created.
set_seed(42)

# Checkpoint to train. To run the example faster switch to the random model:
# model_name = "hf-internal-testing/tiny-random-LlamaForCausalLM"
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

micro_batch_size = 1

# DeepSpeed-backed sequence parallelism settings; `sp_size` has to match
# `num_processes` in sp-alst.accelerate-config.yml.
sp_handler = DeepSpeedSequenceParallelConfig(
    sp_seq_length=256,
    sp_seq_length_is_variable=True,
    sp_attn_implementation="sdpa",
)
parallelism_config = ParallelismConfig(
    sp_backend="deepspeed",
    sp_size=4,
    sp_handler=sp_handler,
)

# Pass log_with="wandb" to the Accelerator as well to enable logging into wandb.
accelerator = Accelerator(parallelism_config=parallelism_config)

wandb_init_kwargs = {"entity": "yak", "name": "deepspeed"}
accelerator.init_trackers(
    project_name="ulysses-accelerate",
    config={},
    init_kwargs={"wandb": wandb_init_kwargs},
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# 2 quick rough datasets to demonstrate the workings
# Flip this flag to switch between the real chat dataset and the synthetic one.
use_real_dataset = True

if use_real_dataset:
    from datasets import load_dataset

    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft[:12]")

    def convert(ex):
        """Render one conversation with the chat template, then tokenize it.

        This is a quick example; it should be made more efficient before
        being used in a real application.
        """
        chat_text = tokenizer.apply_chat_template(conversation=ex["messages"], tokenize=False)
        return tokenizer(chat_text, max_length=256, padding=True, truncation=True)

    ds = ds.map(convert, batched=False, remove_columns=["prompt", "prompt_id", "messages"])

    def collate_fn(batch):
        """Build the model inputs from the first sample of `batch`.

        Only `batch[0]` is read; every returned tensor gets a leading
        dimension of size 1. `labels` is the same tensor as `input_ids`.
        """
        sample = batch[0]
        ids = torch.tensor(sample["input_ids"]).unsqueeze(0)
        mask = torch.tensor(sample["attention_mask"]).unsqueeze(0)
        positions = torch.arange(ids.shape[1]).unsqueeze(0)
        return {
            "input_ids": ids,
            "position_ids": positions,
            "labels": ids,
            "attention_mask": mask,
        }

    dl = torch.utils.data.DataLoader(
        ds,
        batch_size=micro_batch_size,
        collate_fn=collate_fn,
        drop_last=True,
        shuffle=False,
    )

else:
    # Synthetic data: `samples` rows of `seqlen` consecutive integers.
    samples = 16
    seqlen = 256
    total_tokens = seqlen * samples
    input_ids = torch.arange(1, total_tokens + 1).view(-1, seqlen) + 100
    position_ids = torch.arange(total_tokens).view(-1, seqlen)

    ds = torch.utils.data.TensorDataset(input_ids, position_ids)

    def collate_fn(batch):
        """Build the model inputs from the first (ids, positions) pair of `batch`."""
        ids, positions = batch[0]
        return {
            "input_ids": ids.unsqueeze(0),
            "position_ids": positions.unsqueeze(0),
            "labels": ids.unsqueeze(0),
        }

    dl = torch.utils.data.DataLoader(ds, batch_size=micro_batch_size, collate_fn=collate_fn)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Only rank 0 prints the dataloader sizes below.
rank = torch.distributed.get_rank()
is_rank_zero = rank == 0

if is_rank_zero:
    print(f"DL orig: {len(dl)} samples")

# From here on `model`, `optimizer` and `dl` are the objects returned by Accelerate.
model, optimizer, dl = accelerator.prepare(model, optimizer, dl)

if is_rank_zero:
    print(f"DL w/ adapter: {len(dl)} samples")

# Sequence-parallel bookkeeping: the group and its size are only defined
# when sequence parallelism is actually enabled (sp_size > 1).
if parallelism_config:
    sp_size = parallelism_config.sp_size
else:
    sp_size = 1

if sp_size > 1:
    sp_world_size = parallelism_config.sp_size
    sp_group = accelerator.torch_device_mesh["sp"].get_group()

unwrapped_model = accelerator.unwrap_model(model)

# Normal training loop
# Normal training loop.
# `step` (rather than `iter`) is used as the loop variable so the `iter` builtin is not shadowed.
for step, batch in enumerate(dl):
    optimizer.zero_grad()

    if rank == 0:
        print(f"batch {step}: seqlen: {len(batch['input_ids'][0])}")
    batch = move_to_device(batch, model.device)

    # The model automatically receives shift_labels via **kwargs and uses it for loss computation.
    # Both standard transformer models and Liger-patched models handle this correctly.
    outputs = model(**batch)
    loss = outputs.loss

    if sp_size > 1:
        # differentiable weighted per-shard-loss aggregation across ranks
        losses_per_rank = torch.distributed.nn.functional.all_gather(loss, group=sp_group)
        # special dealing with SFT that has prompt tokens that aren't used in loss computation:
        # only labels different from -100 are counted as contributing tokens
        good_tokens = (batch["shift_labels"] != -100).view(-1).sum()
        good_tokens_per_rank = torch.distributed.nn.functional.all_gather(good_tokens, group=sp_group)
        # `sp_rank` indexes the gathered per-rank values; it is deliberately not called `rank`
        # so it cannot be confused with this process's own rank used for printing
        total_loss = sum(
            losses_per_rank[sp_rank] * good_tokens_per_rank[sp_rank] for sp_rank in range(sp_world_size)
        )
        total_good_tokens = sum(good_tokens_per_rank)
        # max(..., 1) avoids dividing by zero when no token contributes to the loss
        loss = total_loss / max(total_good_tokens, 1)

    if rank == 0:
        accelerator.print(f"{step}: {loss=}")
    accelerator.log(dict(train_loss=loss, step=step))

    accelerator.backward(loss)
    optimizer.step()

accelerator.end_training()


================================================
FILE: examples/alst_ulysses_sequence_parallelism/sp-alst.sh
================================================
# Launch the ALST/Ulysses sequence parallelism example (sp-alst.py) through the
# Accelerate launcher, using the config file that sits next to this script.
# The rendezvous endpoint and the main process address/port use the same host and port.
export MASTER_ADDR=localhost
export MASTER_PORT=9998
# Variable expansions are quoted so the values are always passed as single arguments.
python -u -m accelerate.commands.launch \
    --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT}" \
    --main_process_ip "${MASTER_ADDR}" \
    --main_process_port "${MASTER_PORT}" \
    --config_file sp-alst.accelerate-config.yml \
    sp-alst.py


================================================
FILE: examples/by_feature/README.md
================================================
# What are these scripts?

All scripts in this folder originate from the `nlp_example.py` file, as it is a very simplistic NLP training example using Accelerate with zero extra features.

From there, each further script adds in just **one** feature of Accelerate, showing how you can quickly modify your own scripts to implement these capabilities.

A full example with all of these parts integrated together can be found in the `complete_nlp_example.py` script and `complete_cv_example.py` script.

Adjustments to each script from the base `nlp_example.py` file can be found quickly by searching for "# New Code #"

## Example Scripts by Feature and their Arguments

### Base Example (`../nlp_example.py`)

- Shows how to use `Accelerator` in an extremely simplistic PyTorch training loop
- Arguments available:
  - `mixed_precision`, whether to use mixed precision. ("no", "fp16", or "bf16")
  - `cpu`, whether to train using only the CPU. (yes/no/1/0)

All following scripts also accept these arguments in addition to their added ones.

These arguments should be added at the end of any method for starting the python script (such as `python`, `accelerate launch`, `python -m torch.distributed.run`), such as:

```bash
accelerate launch ../nlp_example.py --mixed_precision fp16 --cpu 0
```

### Checkpointing and Resuming Training (`checkpointing.py`)

- Shows how to use `Accelerator.save_state` and `Accelerator.load_state` to save or continue training
- **It is assumed you are continuing off the same training script**
- Arguments available:
  - `checkpointing_steps`, after how many steps the various states should be saved. ("epoch", 1, 2, ...)
  - `output_dir`, where saved state folders should be saved to, default is current working directory
  - `resume_from_checkpoint`, what checkpoint folder to resume from. ("epoch_0", "step_22", ...)

These arguments should be added at the end of any method for starting the python script (such as `python`, `accelerate launch`, `python -m torch.distributed.run`), such as:

(Note, `resume_from_checkpoint` assumes that we've run the script for one epoch with the `--checkpointing_steps epoch` flag)

```bash
accelerate launch ./checkpointing.py --checkpointing_steps epoch --output_dir "checkpointing_tutorial" --resume_from_checkpoint "checkpointing_tutorial/epoch_0"
```

### Cross Validation (`cross_validation.py`)

- Shows how to use `Accelerator.free_memory` and run cross validation efficiently with `datasets`.
- Arguments available:
  - `num_folds`, the number of folds the training dataset should be split into.

These arguments should be added at the end of any method for starting the python script (such as `python`, `accelerate launch`, `python -m torch.distributed.run`), such as:

```bash
accelerate launch ./cross_validation.py --num_folds 2
```

### Experiment Tracking (`tracking.py`)

- Shows how to use `Accelerator.init_trackers` and `Accelerator.log`
- Can be used with Weights and Biases, TensorBoard, or CometML.
- Arguments available:
  - `with_tracking`, whether to load in all available experiment trackers from the environment.

These arguments should be added at the end of any method for starting the python script (such as `python`, `accelerate launch`, `python -m torch.distributed.run`), such as:

```bash
accelerate launch ./tracking.py --with_tracking
```

### Gradient Accumulation (`gradient_accumulation.py`)

- Shows how to use `Accelerator.no_sync` to prevent gradient averaging in a distributed setup.
- Arguments available:
  - `gradient_accumulation_steps`, the number of steps to perform before the gradients are accumulated and the optimizer and scheduler are stepped + zero_grad

These arguments should be added at the end of any method for starting the python script (such as `python`, `accelerate launch`, `python -m torch.distributed.run`), such as:

```bash
accelerate launch ./gradient_accumulation.py --gradient_accumulation_steps 5
```

### LocalSGD (`local_sgd.py`)
- Shows how to use `Accelerator.no_sync` to prevent gradient averaging in a distributed setup. However, unlike gradient accumulation, this method does not change the effective batch size. Local SGD can be combined with gradient accumulation.

These arguments should be added at the end of any method for starting the python script (such as `python`, `accelerate launch`, `python -m torch.distributed.run`), such as:

```bash
accelerate launch ./local_sgd.py --local_sgd_steps 4
```

### DDP Communication Hook (`ddp_comm_hook.py`)

- Shows how to use DDP Communication Hooks to control and optimize gradient communication across workers in a DistributedDataParallel setup.
- Arguments available:
  - `ddp_comm_hook`, the type of DDP communication hook to use. Choose between `no`, `fp16`, `bf16`, `power_sgd`, and `batched_power_sgd`.

These arguments should be added at the end of any method for starting the python script (such as `accelerate launch`, `python -m torch.distributed.run`), such as:

```bash
accelerate launch ./ddp_comm_hook.py --mixed_precision fp16 --ddp_comm_hook power_sgd
```

### Profiler (`profiler.py`)

- Shows how to use the profiling capabilities of `Accelerate` to profile PyTorch models during training.
- Uses the `ProfileKwargs` handler to customize profiling options, including activities, scheduling, and additional profiling options.
- Can generate and save profiling traces in JSON format for visualization in Chrome's tracing tool.

Arguments available:
- `--record_shapes`: If passed, records shapes for profiling.
- `--profile_memory`: If passed, profiles memory usage.
- `--with_stack`: If passed, profiles stack traces.
- `--with_flops`: If passed, profiles floating point operations (FLOPS).
- `--output_trace_dir`: If specified, saves the profiling trace to the given dir in JSON format.
- `--cpu`: If passed, trains on the CPU instead of GPU.

These arguments should be added at the end of any method for starting the Python script (such as `python`, `accelerate launch`, `python -m torch.distributed.run`), such as:

```bash
accelerate launch ./profiler.py --record_shapes --profile_memory --with_flops --output_trace_dir "profiler"
```


================================================
FILE: examples/by_feature/automatic_gradient_accumulation.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

# New Code #
import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator
from accelerate.utils import find_executable_batch_size


########################################################################
# This is a fully working simple example to use Accelerate,
# specifically showcasing how to combine both the gradient accumulation
# and automatic batch size finder utilities of Accelerate to perform
# automatic gradient accumulation
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUS (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# New additions from the base script can be found quickly by
# looking for the # New Code # tags
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################

# Fixed batch size used for the evaluation `DataLoader` built in `get_dataloaders`.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    """
    Build the train and validation `DataLoader`s for the GLUE MRPC dataset,
    tokenized with the "bert-base-cased" tokenizer.

    Args:
        accelerator (`Accelerator`):
            The `Accelerator` whose `main_process_first` context and `mixed_precision` setting are used.
        batch_size (`int`, *optional*):
            Batch size of the training DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    raw_datasets = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # `max_length=None` means "use the model max length", which is also the default
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every split; the main process goes first and the other processes follow
    with accelerator.main_process_first():
        encoded = raw_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # The models of the transformers library expect the label column to be called 'labels'
    encoded = encoded.rename_column("label", "labels")

    def collate_fn(examples):
        # Pad to a round multiple (16 for fp8, 8 for other mixed precision modes), or not at all in full precision
        precision = accelerator.mixed_precision
        if precision == "no":
            multiple = None
        elif precision == "fp8":
            multiple = 16
        else:
            multiple = 8

        return tokenizer.pad(
            examples,
            padding="longest",
            pad_to_multiple_of=multiple,
            return_tensors="pt",
        )

    # Only the training split is shuffled and follows the requested batch size
    loaders = (
        DataLoader(encoded["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size),
        DataLoader(encoded["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE),
    )
    return loaders


# For testing only: when the `TESTING_MOCKED_DATALOADERS` environment variable is "1",
# `get_dataloaders` is rebound to the `mocked_dataloaders` test helper.
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
    from accelerate.test_utils.training import mocked_dataloaders

    get_dataloaders = mocked_dataloaders  # noqa: F811


def training_function(config, args):
    """
    Train and evaluate a sequence classification model, retrying with a smaller batch size (and more gradient
    accumulation steps) through `find_executable_batch_size`.

    Args:
        config (`dict`):
            Hyper-parameters. The keys `lr`, `num_epochs`, `seed` and `batch_size` are read; `num_epochs` is
            overwritten with 2 when the `TESTING_MOCKED_DATALOADERS` environment variable is "1".
        args:
            Parsed command-line arguments. The attributes `cpu` and `mixed_precision` are read.
    """
    # For testing only
    if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
        config["num_epochs"] = 2
    # Initialize accelerator
    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    # The batch size we want each optimizer update to "observe", whatever per-step batch size ends up fitting
    observed_batch_size = int(config["batch_size"])

    metric = evaluate.load("glue", "mrpc")

    # New Code #
    # We use the `find_executable_batch_size` decorator, passing in the desired observed batch size
    # to train on. If a device OOM error occurs, it will retry this loop cutting the batch size in
    # half each time. From this, we can calculate the number of gradient accumulation steps needed
    # and modify the Accelerator object as a result
    @find_executable_batch_size(starting_batch_size=int(observed_batch_size))
    def inner_training_loop(batch_size):
        # Since we need to modify the outside accelerator object, we need to bring it
        # to the local scope
        nonlocal accelerator

        # We can calculate the number of gradient accumulation steps based on the current
        # batch size vs the starting batch size
        num_gradient_accumulation_steps = observed_batch_size // batch_size

        # And then set it in the Accelerator directly:
        accelerator.gradient_accumulation_steps = num_gradient_accumulation_steps

        # Next we need to free all of the stored model references in the Accelerator each time
        accelerator.free_memory()

        # And set the seed so our results are reproducible on each reset
        set_seed(seed)

        # Instantiate the model (we build the model here so that the seed also controls new weights initialization)
        model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

        # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
        # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
        # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
        model = model.to(accelerator.device)

        # Instantiate optimizer
        optimizer = AdamW(params=model.parameters(), lr=lr)
        # The dataloaders are rebuilt on every attempt so the training one uses the current `batch_size`
        train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)

        # Instantiate scheduler
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=100,
            num_training_steps=(len(train_dataloader) * num_epochs),
        )

        # Prepare everything
        # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
        # prepare method.
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
            model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
        )

        # Now we train the model
        for epoch in range(num_epochs):
            model.train()
            for step, batch in enumerate(train_dataloader):
                # And perform gradient accumulation
                with accelerator.accumulate(model):
                    # We could avoid this line since we set the accelerator with `device_placement=True`.
                    batch.to(accelerator.device)
                    outputs = model(**batch)
                    loss = outputs.loss
                    accelerator.backward(loss)
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()

            model.eval()
            for step, batch in enumerate(eval_dataloader):
                # We could avoid this line since we set the accelerator with `device_placement=True`.
                batch.to(accelerator.device)
                with torch.no_grad():
                    outputs = model(**batch)
                predictions = outputs.logits.argmax(dim=-1)
                predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
                metric.add_batch(
                    predictions=predictions,
                    references=references,
                )

            eval_metric = metric.compute()
            # Use accelerator.print to print only on the main process.
            accelerator.print(f"epoch {epoch}:", eval_metric)

    # New Code #
    # And call it at the end with no arguments
    # Note: You could also refactor this outside of your training loop function
    inner_training_loop()
    accelerator.end_training()


def main():
    """Parse the command-line flags and run `training_function` with this example's fixed hyper-parameters."""
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined verbatim, so each fragment carries its own trailing space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    args = parser.parse_args()
    # New Code #
    # We modify the starting batch size to be an observed batch size of 256, to guarantee an initial device OOM
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 256}
    training_function(config, args)


if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/checkpointing.py
================================================
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup

from accelerate import Accelerator, DataLoaderConfiguration, DistributedType
from accelerate.utils import set_seed


########################################################################
# This is a fully working simple example to use Accelerate,
# specifically showcasing the checkpointing capability,
# and builds off the `nlp_example.py` script.
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUS (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To help focus on the differences in the code, building `DataLoaders`
# was refactored into its own function.
# New additions from the base script can be found quickly by
# looking for the # New Code # tags
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################

# Largest per-step batch size used outside of XLA; `training_function` switches to gradient accumulation above it.
MAX_GPU_BATCH_SIZE = 16
# Fixed batch size used for the evaluation `DataLoader` built in `get_dataloaders`.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    """
    Build the train and validation `DataLoader`s for the GLUE MRPC dataset,
    tokenized with the "bert-base-cased" tokenizer.

    Args:
        accelerator (`Accelerator`):
            The `Accelerator` whose `main_process_first` context, `distributed_type` and `mixed_precision`
            settings are used.
        batch_size (`int`, *optional*):
            Batch size of the training DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    raw_datasets = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # `max_length=None` means "use the model max length", which is also the default
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every split; the main process goes first and the other processes follow
    with accelerator.main_process_first():
        encoded = raw_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # The models of the transformers library expect the label column to be called 'labels'
    encoded = encoded.rename_column("label", "labels")

    def collate_fn(examples):
        # Pad to a round multiple (16 for fp8, 8 for other mixed precision modes), or not at all in full precision
        precision = accelerator.mixed_precision
        if precision == "no":
            multiple = None
        elif precision == "fp8":
            multiple = 16
        else:
            multiple = 8

        # On TPU it's best to pad everything to the same length or training will be very slow.
        on_xla = accelerator.distributed_type == DistributedType.XLA

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=128 if on_xla else None,
            pad_to_multiple_of=multiple,
            return_tensors="pt",
        )

    # Only the training split is shuffled and follows the requested batch size
    loaders = (
        DataLoader(encoded["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size),
        DataLoader(encoded["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE),
    )
    return loaders


# For testing only: when the `TESTING_MOCKED_DATALOADERS` environment variable is "1",
# `get_dataloaders` is rebound to the `mocked_dataloaders` test helper.
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
    from accelerate.test_utils.training import mocked_dataloaders

    get_dataloaders = mocked_dataloaders  # noqa: F811


def training_function(config, args):
    """
    Train and evaluate a sequence classification model on the data returned by `get_dataloaders`, saving
    checkpoints with `Accelerator.save_state` and optionally resuming from one with `Accelerator.load_state`.

    Args:
        config (`dict`):
            Hyper-parameters. The keys `lr`, `num_epochs`, `seed` and `batch_size` are read; `num_epochs` is
            overwritten with 2 when the `TESTING_MOCKED_DATALOADERS` environment variable is "1".
        args:
            Parsed command-line arguments. The attributes `cpu`, `mixed_precision`, `use_stateful_dataloader`,
            `checkpointing_steps`, `output_dir` and `resume_from_checkpoint` are read.

    Raises:
        ValueError: If `args.checkpointing_steps` is a string that is neither a number nor `"epoch"`.
    """
    # For testing only
    if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
        config["num_epochs"] = 2
    # Initialize accelerator
    dataloader_config = DataLoaderConfiguration(use_stateful_dataloader=args.use_stateful_dataloader)
    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision, dataloader_config=dataloader_config)
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])

    # New Code #
    # Parse out whether we are saving every epoch or after a certain number of batches
    if hasattr(args.checkpointing_steps, "isdigit"):
        if args.checkpointing_steps == "epoch":
            checkpointing_steps = args.checkpointing_steps
        elif args.checkpointing_steps.isdigit():
            checkpointing_steps = int(args.checkpointing_steps)
        else:
            raise ValueError(
                f"Argument `checkpointing_steps` must be either a number or `epoch`. `{args.checkpointing_steps}` passed."
            )
    else:
        checkpointing_steps = None

    # If the batch size is too big we use gradient accumulation.
    # This has to happen before the dataloaders are built, otherwise the reduced batch size is never applied to them
    # while the loss is still divided by `gradient_accumulation_steps`.
    gradient_accumulation_steps = 1
    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    set_seed(seed)

    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
    metric = evaluate.load("glue", "mrpc")

    # Instantiate the model (we build the model here so that the seed also controls new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
    # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
    )

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # New Code #
    # We need to keep track of how many total steps we have iterated over
    overall_step = 0
    # We also need to keep track of the starting epoch so files are named properly
    starting_epoch = 0
    # Number of batches already consumed in `starting_epoch`; stays `None` unless we resume from a `step_{i}` folder
    resume_step = None

    # We need to load the checkpoint back in before training here with `load_state`
    # The total number of epochs is adjusted based on where the state is being loaded from,
    # as we assume continuation of the same training script
    if args.resume_from_checkpoint:
        accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
        accelerator.load_state(args.resume_from_checkpoint)
        # `normpath` drops a trailing path separator, which would otherwise make `basename` return ""
        path = os.path.basename(os.path.normpath(args.resume_from_checkpoint))
        # Extract `epoch_{i}` or `step_{i}`
        training_difference = os.path.splitext(path)[0]

        if "epoch" in training_difference:
            starting_epoch = int(training_difference.replace("epoch_", "")) + 1
            # Every epoch up to and including the saved one has been fully iterated over
            overall_step = starting_epoch * len(train_dataloader)
        else:
            # Keep the absolute step count so new `step_{i}` folders continue the numbering of the resumed run
            overall_step = int(training_difference.replace("step_", ""))
            starting_epoch = overall_step // len(train_dataloader)
            resume_step = overall_step - starting_epoch * len(train_dataloader)

    # Now we train the model
    for epoch in range(starting_epoch, num_epochs):
        model.train()
        # New Code #
        # We need to skip steps until we reach the resumed step only if we are not using a stateful dataloader.
        # After the first resumed epoch, we go back to the original dataloader.
        if epoch == starting_epoch and resume_step is not None and not args.use_stateful_dataloader:
            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
        else:
            active_dataloader = train_dataloader
        for step, batch in enumerate(active_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / gradient_accumulation_steps
            accelerator.backward(loss)
            if step % gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
            # New Code #
            overall_step += 1

            # New Code #
            # We save the model, optimizer, lr_scheduler, and seed states by calling `save_state`
            # These are saved to folders named `step_{overall_step}`
            # Will contain files: "pytorch_model.bin", "optimizer.bin", "scheduler.bin", and "random_states.pkl"
            # If mixed precision was used, will also save a "scalar.bin" file
            if isinstance(checkpointing_steps, int) and overall_step % checkpointing_steps == 0:
                output_dir = f"step_{overall_step}"
                if args.output_dir is not None:
                    output_dir = os.path.join(args.output_dir, output_dir)
                accelerator.save_state(output_dir)
        model.eval()
        for batch in eval_dataloader:
            # We could avoid this line since we set the accelerator with `device_placement=True` (the default).
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
            metric.add_batch(
                predictions=predictions,
                references=references,
            )
        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)

        # New Code #
        # We save the model, optimizer, lr_scheduler, and seed states by calling `save_state`
        # These are saved to folders named `epoch_{epoch}`
        # Will contain files: "pytorch_model.bin", "optimizer.bin", "scheduler.bin", and "random_states.pkl"
        # If mixed precision was used, will also save a "scalar.bin" file
        if checkpointing_steps == "epoch":
            output_dir = f"epoch_{epoch}"
            if args.output_dir is not None:
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)
    accelerator.end_training()


def main():
    """Parse the command-line flags and run `training_function` with this example's fixed hyper-parameters."""
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined verbatim, so each fragment carries its own trailing space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    parser.add_argument(
        "--checkpointing_steps",
        type=str,
        default=None,
        help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=".",
        help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
    )
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help="If the training should continue from a checkpoint folder.",
    )
    parser.add_argument(
        "--use_stateful_dataloader",
        action="store_true",
        help="If the dataloader should be a resumable stateful dataloader.",
    )
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/cross_validation.py
================================================
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

import evaluate
import numpy as np
import torch
from datasets import DatasetDict, load_dataset

# New Code #
# We'll be using StratifiedKFold for this example
from sklearn.model_selection import StratifiedKFold
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType


########################################################################
# This is a fully working simple example to use Accelerate,
# specifically showcasing how to perform Cross Validation,
# and builds off the `nlp_example.py` script.
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUS (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To help focus on the differences in the code, building `DataLoaders`
# was refactored into its own function.
# New additions from the base script can be found quickly by
# looking for the # New Code # tags
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# Largest per-step batch size used outside of XLA; `training_function` switches to gradient accumulation above it.
MAX_GPU_BATCH_SIZE = 16
# Fixed batch size used for the validation and test `DataLoader`s built in `get_fold_dataloaders`.
EVAL_BATCH_SIZE = 32

# New Code #
# We need a different `get_dataloaders` function that will build dataloaders by index


def get_fold_dataloaders(
    accelerator: Accelerator, dataset: DatasetDict, train_idxs: list[int], valid_idxs: list[int], batch_size: int = 16
):
    """
    Build the train, validation, and test dataloaders for one cross-validation fold.

    Args:
        accelerator (`Accelerator`):
            The main `Accelerator` object
        dataset (`DatasetDict`):
            The full dataset. Its "train" split is divided into the fold's train and validation subsets, and its
            "validation" split is used as the test set.
        train_idxs (list of `int`):
            Row indices of the "train" split that make up this fold's training subset
        valid_idxs (list of `int`):
            Row indices of the "train" split that make up this fold's validation subset
        batch_size (`int`):
            Batch size of the training dataloader. Default is 16. The validation and test dataloaders always use
            `EVAL_BATCH_SIZE`.

    Returns:
        A `(train_dataloader, eval_dataloader, test_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    full_train = dataset["train"]
    fold_datasets = DatasetDict(
        {
            "train": full_train.select(train_idxs),
            "validation": full_train.select(valid_idxs),
            "test": dataset["validation"],
        }
    )

    def tokenize_function(examples):
        # `max_length=None` means "use the model max length", which is also the default
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every split; the main process goes first and the other processes follow
    with accelerator.main_process_first():
        encoded = fold_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # The models of the transformers library expect the label column to be called 'labels'
    encoded = encoded.rename_column("label", "labels")

    def collate_fn(examples):
        # Pad to a round multiple (16 for fp8, 8 for other mixed precision modes), or not at all in full precision
        precision = accelerator.mixed_precision
        if precision == "no":
            multiple = None
        elif precision == "fp8":
            multiple = 16
        else:
            multiple = 8

        # On TPU it's best to pad everything to the same length or training will be very slow.
        on_xla = accelerator.distributed_type == DistributedType.XLA

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=128 if on_xla else None,
            pad_to_multiple_of=multiple,
            return_tensors="pt",
        )

    # Only the training subset is shuffled and follows the requested batch size
    loaders = (
        DataLoader(encoded["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size),
        DataLoader(encoded["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE),
        DataLoader(encoded["test"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE),
    )
    return loaders


def training_function(config, args):
    """
    Run stratified k-fold cross validation: train one model per fold, then average the per-fold test logits and
    report the metric of the averaged predictions.

    Args:
        config (`dict`):
            Hyper-parameters. The keys `lr`, `num_epochs`, `seed` and `batch_size` are read.
        args:
            Parsed command-line arguments. The attributes `num_folds`, `cpu` and `mixed_precision` are read.
    """
    # New Code #
    test_predictions = []
    # Download the dataset
    datasets = load_dataset("glue", "mrpc")
    # Create our splits
    kfold = StratifiedKFold(n_splits=int(args.num_folds))
    # Initialize accelerator
    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])

    metric = evaluate.load("glue", "mrpc")

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    set_seed(seed)

    # New Code #
    # Create our folds:
    folds = kfold.split(np.zeros(datasets["train"].num_rows), datasets["train"]["label"])
    test_references = []
    # Iterate over them
    for i, (train_idxs, valid_idxs) in enumerate(folds):
        # Forward the (possibly reduced) batch size, otherwise the dataloaders silently fall back to their default
        train_dataloader, eval_dataloader, test_dataloader = get_fold_dataloaders(
            accelerator,
            datasets,
            train_idxs,
            valid_idxs,
            batch_size=batch_size,
        )
        # Instantiate the model (we build the model here so that the seed also controls new weights initialization)
        model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

        # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
        # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
        # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
        model = model.to(accelerator.device)

        # Instantiate optimizer
        optimizer = AdamW(params=model.parameters(), lr=lr)

        # Instantiate scheduler
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=100,
            num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
        )

        # Prepare everything
        # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
        # prepare method.
        # NOTE(review): `test_dataloader` is not passed to `prepare` here - confirm this is intended for
        # multi-process runs.
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
            model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
        )

        # Now we train the model
        for epoch in range(num_epochs):
            model.train()
            for step, batch in enumerate(train_dataloader):
                # We could avoid this line since we set the accelerator with `device_placement=True`.
                batch.to(accelerator.device)
                outputs = model(**batch)
                loss = outputs.loss
                loss = loss / gradient_accumulation_steps
                accelerator.backward(loss)
                if step % gradient_accumulation_steps == 0:
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()

            model.eval()
            for batch in eval_dataloader:
                # We could avoid this line since we set the accelerator with `device_placement=True`.
                batch.to(accelerator.device)
                with torch.no_grad():
                    outputs = model(**batch)
                predictions = outputs.logits.argmax(dim=-1)
                predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
                metric.add_batch(
                    predictions=predictions,
                    references=references,
                )

            eval_metric = metric.compute()
            # Use accelerator.print to print only on the main process.
            accelerator.print(f"epoch {epoch}:", eval_metric)

        # New Code #
        # We also run predictions on the test set at the very end
        fold_predictions = []
        for batch in test_dataloader:
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            # Keep the raw logits (no argmax) so they can be averaged across folds afterwards
            predictions = outputs.logits
            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
            fold_predictions.append(predictions.cpu())
            if i == 0:
                # The test set is the same for every fold, so its references are only collected once
                test_references.append(references.cpu())
        # Store this fold's test logits as a single tensor
        test_predictions.append(torch.cat(fold_predictions, dim=0))
        # We now need to release all our memory and get rid of the current model, optimizer, etc
        model, optimizer = accelerator.free_memory(model, optimizer)
    # New Code #
    # Finally we check the accuracy of our folded results:
    test_references = torch.cat(test_references, dim=0)
    # Average the logits over the folds, then take the most likely class
    preds = torch.stack(test_predictions, dim=0).sum(dim=0).div(int(args.num_folds)).argmax(dim=-1)
    test_metric = metric.compute(predictions=preds, references=test_references)
    accelerator.print("Average test metrics from all folds:", test_metric)
    accelerator.end_training()


def main():
    """
    Parse the command-line options and launch the cross-validation training run.

    The parsed arguments (`--mixed_precision`, `--cpu`, `--num_folds`) are passed to `training_function`
    together with a fixed hyper-parameter dictionary.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are concatenated verbatim, so every fragment needs its own separating space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    # New Code #
    parser.add_argument("--num_folds", type=int, default=3, help="The number of splits to perform across the dataset")
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/ddp_comm_hook.py
================================================
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType
from accelerate.utils import DDPCommunicationHookType, DistributedDataParallelKwargs


########################################################################
# This is a fully working simple example of using Accelerate
# with a DDP communication hook
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUs (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# NOTE(review): MAX_GPU_BATCH_SIZE is not referenced in the visible part of this example — confirm it is still needed.
MAX_GPU_BATCH_SIZE = 16
# Batch size of the evaluation DataLoader built in `get_dataloaders`.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    """
    Builds the training and evaluation `DataLoader`s for the `glue`/`mrpc` dataset,
    tokenized with "bert-base-cased".

    Args:
        accelerator (`Accelerator`):
            The `Accelerator` whose `main_process_first`, `distributed_type` and `mixed_precision`
            drive preprocessing and padding.
        batch_size (`int`, *optional*, defaults to 16):
            The batch size of the training DataLoader. The evaluation DataLoader always uses
            `EVAL_BATCH_SIZE`.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    raw_splits = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => fall back to the model max length (which is the default anyway)
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every example of every split, letting the main process go first.
    with accelerator.main_process_first():
        tokenized_splits = raw_splits.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # The models of the transformers library expect the label column to be called 'labels'.
    tokenized_splits = tokenized_splits.rename_column("label", "labels")

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
        fixed_length = 128 if accelerator.distributed_type == DistributedType.XLA else None

        # When using mixed precision we want round multiples of 8/16
        precision = accelerator.mixed_precision
        if precision == "fp8":
            multiple = 16
        elif precision == "no":
            multiple = None
        else:
            multiple = 8

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=fixed_length,
            pad_to_multiple_of=multiple,
            return_tensors="pt",
        )

    # Only the training split is shuffled; evaluation uses its own fixed batch size.
    train_loader = DataLoader(
        tokenized_splits["train"],
        shuffle=True,
        collate_fn=collate_fn,
        batch_size=batch_size,
    )
    eval_loader = DataLoader(
        tokenized_splits["validation"],
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=EVAL_BATCH_SIZE,
    )
    return train_loader, eval_loader


# For testing only: when the TESTING_MOCKED_DATALOADERS environment variable is "1", `get_dataloaders`
# is rebound to `mocked_dataloaders` from accelerate's test utilities.
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
    from accelerate.test_utils.training import mocked_dataloaders

    get_dataloaders = mocked_dataloaders  # noqa: F811


def training_function(config, args):
    """
    Fine-tunes "bert-base-cased" on GLUE MRPC using the DDP communication hook selected on the command line.

    Args:
        config (`dict`):
            Hyper-parameters; the keys `"lr"`, `"num_epochs"`, `"seed"` and `"batch_size"` are read.
            `"num_epochs"` is overwritten with 2 when `TESTING_MOCKED_DATALOADERS` is `"1"`.
        args (`argparse.Namespace`):
            Parsed options; `cpu`, `mixed_precision`, `ddp_comm_hook` and `ddp_comm_wrapper` are read.
    """
    # For testing only
    if os.environ.get("TESTING_MOCKED_DATALOADERS") == "1":
        config["num_epochs"] = 2

    # New Code #
    # Wrap the command-line choices in `DDPCommunicationHookType` and pass them on through a kwargs handler.
    ddp_kwargs = DistributedDataParallelKwargs(
        comm_hook=DDPCommunicationHookType(args.ddp_comm_hook),
        comm_wrapper=DDPCommunicationHookType(args.ddp_comm_wrapper),
    )
    # Initialize accelerator
    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision, kwargs_handlers=[ddp_kwargs])

    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    learning_rate = config["lr"]
    total_epochs = int(config["num_epochs"])
    random_seed = int(config["seed"])
    train_batch_size = int(config["batch_size"])

    metric = evaluate.load("glue", "mrpc")

    set_seed(random_seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, train_batch_size)
    # The model is built after seeding so that the seed also controls the initialization of new weights.
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

    # This move is optional because the accelerator uses `device_placement=True` by default. When placing
    # tensors manually, it must happen before the optimizer is created, otherwise training will not work on
    # TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)

    optimizer = AdamW(params=model.parameters(), lr=learning_rate)

    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=(len(train_dataloader) * total_epochs),
    )

    # `prepare` hands the objects back in the order they were given, so unpack them in that same order.
    prepared = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader, lr_scheduler)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = prepared

    for epoch in range(total_epochs):
        # Training pass
        model.train()
        for batch in train_dataloader:
            # Optional as well, since the accelerator is set with `device_placement=True`.
            batch.to(accelerator.device)
            # The `accumulate` context manager takes care of gradient accumulation
            with accelerator.accumulate(model):
                loss = model(**batch).loss
                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

        # Evaluation pass
        model.eval()
        for batch in eval_dataloader:
            # Optional as well, since the accelerator is set with `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                logits = model(**batch).logits
            predictions, references = accelerator.gather_for_metrics((logits.argmax(dim=-1), batch["labels"]))
            metric.add_batch(predictions=predictions, references=references)

        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)
    accelerator.end_training()


def main():
    """
    Parse the command-line options and launch the DDP communication hook training run.

    The parsed arguments (`--mixed_precision`, `--ddp_comm_hook`, `--ddp_comm_wrapper`, `--cpu`) are passed
    to `training_function` together with a fixed hyper-parameter dictionary.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are concatenated verbatim, so every fragment needs its own separating space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    # New Code #
    parser.add_argument(
        "--ddp_comm_hook",
        type=str,
        default="no",
        choices=["no", "fp16", "bf16", "power_sgd", "batched_power_sgd"],
        help="DDP Communication hook to use. Choose between `no`, `fp16`, `bf16`, `power_sgd`, and `batched_power_sgd`.",
    )
    # New Code #
    parser.add_argument(
        "--ddp_comm_wrapper",
        type=str,
        default="no",
        choices=["no", "fp16", "bf16"],
        help="DDP Communication wrapper to use. Choose between `no`, `fp16`, and `bf16`.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/deepspeed_with_config_support.py
================================================
#!/usr/bin/env python
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...)
on a text file or a dataset without using HuggingFace Trainer.

Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=text-generation
"""
# You can also adapt this script to your own causal language modeling task. Pointers for this are left as comments.

import argparse
import json
import logging
import math
import os
import random
from itertools import chain
from pathlib import Path

import datasets
import torch
import transformers
from datasets import load_dataset
from huggingface_hub import HfApi
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    SchedulerType,
    default_data_collator,
    get_scheduler,
)
from transformers.utils.versions import require_version

from accelerate import Accelerator, DistributedType
from accelerate.logging import get_logger
from accelerate.utils import DummyOptim, DummyScheduler, set_seed


# Module-level logger obtained from `accelerate.logging.get_logger`, named after this module.
logger = get_logger(__name__)

# Version check for `datasets` (>= 1.8.0); the second argument is the hint text handed to `require_version`.
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

# Keys of `MODEL_MAPPING` and the `model_type` of each of them; `MODEL_TYPES` is the list of accepted
# values for the `--model_type` option in `parse_args`.
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)


def _str_to_bool(value):
    """Convert a command-line string such as "true"/"false" to a `bool` (usable as an argparse `type`)."""
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in {"1", "true", "t", "yes", "y", "on"}:
        return True
    if normalized in {"0", "false", "f", "no", "n", "off", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


def parse_args():
    """
    Build the command-line parser of this example, parse `sys.argv` and validate the result.

    Returns:
        `argparse.Namespace`: The parsed arguments.

    Raises:
        ValueError: If none of `--dataset_name`, `--train_file` and `--validation_file` is given.
        AssertionError: If `--train_file`/`--validation_file` does not end in csv, json or txt, or if
            `--push_to_hub` is passed without `--output_dir`.
    """
    parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task")
    parser.add_argument(
        "--dataset_name",
        type=str,
        default=None,
        help="The name of the dataset to use (via the datasets library).",
    )
    parser.add_argument(
        "--dataset_config_name",
        type=str,
        default=None,
        help="The configuration name of the dataset to use (via the datasets library).",
    )
    parser.add_argument(
        "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
    )
    parser.add_argument(
        "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
    )
    parser.add_argument(
        "--validation_split_percentage",
        # The value is interpolated into a `train[:N%]` split expression, so it has to be an integer.
        type=int,
        default=5,
        help="The percentage of the train set used as validation set in case there's no validation split",
    )
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=False,
    )
    parser.add_argument(
        "--config_name",
        type=str,
        default=None,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default=None,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
        type=int,
        default=8,
        help="Batch size (per device) for the training dataloader.",
    )
    parser.add_argument(
        "--per_device_eval_batch_size",
        type=int,
        default=8,
        help="Batch size (per device) for the evaluation dataloader.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-5,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=None,
        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--lr_scheduler_type",
        type=SchedulerType,
        default="linear",
        help="The scheduler type to use.",
        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
    )
    parser.add_argument(
        "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
    )
    parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--model_type",
        type=str,
        default=None,
        help="Model type to use if training from scratch.",
        choices=MODEL_TYPES,
    )
    parser.add_argument(
        "--block_size",
        type=int,
        default=None,
        help=(
            "Optional input sequence length after tokenization. The training dataset will be truncated in block of"
            " this size for training. Default to the model max input length for single sentence inputs (take into"
            " account special tokens)."
        ),
    )
    parser.add_argument(
        "--preprocessing_num_workers",
        type=int,
        default=None,
        help="The number of processes to use for the preprocessing.",
    )
    parser.add_argument(
        "--overwrite_cache",
        # `type=bool` would turn any non-empty string (including "False") into True, so the value is parsed
        # explicitly. A bare `--overwrite_cache` without a value is accepted too and means True.
        type=_str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Overwrite the cached training and evaluation sets",
    )
    parser.add_argument(
        "--no_keep_linebreaks", action="store_true", help="Do not keep line breaks when using TXT files."
    )
    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
    parser.add_argument(
        "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
    )
    parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
    parser.add_argument(
        "--checkpointing_steps",
        type=str,
        default=None,
        help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
    )
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help="If the training should continue from a checkpoint folder.",
    )
    # New Code #
    # Whether to load the best model at the end of training
    parser.add_argument(
        "--load_best_model",
        action="store_true",
        help="Whether to load the best model at the end of training",
    )
    parser.add_argument(
        "--with_tracking",
        action="store_true",
        help="Whether to enable experiment trackers for logging.",
    )
    parser.add_argument(
        "--report_to",
        type=str,
        default="all",
        help=(
            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
            ' `"wandb"`, `"comet_ml"`, `"dvclive"`, and `"swanlab"`. Use `"all"` (default) to report to all integrations.'
            " Only applicable when `--with_tracking` is passed."
        ),
    )
    args = parser.parse_args()

    # Sanity checks
    if args.dataset_name is None and args.train_file is None and args.validation_file is None:
        raise ValueError("Need either a dataset name or a training/validation file.")
    else:
        if args.train_file is not None:
            extension = args.train_file.split(".")[-1]
            assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file."
        if args.validation_file is not None:
            extension = args.validation_file.split(".")[-1]
            assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."

    if args.push_to_hub:
        assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."

    return args


# New Code #
def evaluate(args, model, eval_dataloader, accelerator, eval_dataset):
    """
    Run `model` over `eval_dataloader` and report the perplexity and mean loss.

    Args:
        args: Parsed arguments; only `per_device_eval_batch_size` is read.
        model: The model to evaluate; it is switched to eval mode and called as `model(**batch)`.
        eval_dataloader: Iterable of batches (mappings usable as keyword arguments of the model).
        accelerator: Object providing `gather_for_metrics`.
        eval_dataset: Not used by this function.

    Returns:
        A `(perplexity, eval_loss)` tuple, where `perplexity` is `exp(eval_loss)` or `float("inf")`
        when the exponential overflows, and `eval_loss` is the mean of all gathered losses.
    """
    model.eval()
    gathered_losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            batch_loss = model(**batch).loss

        # Repeat the batch loss once per sample of a full batch before gathering it.
        repeated_loss = batch_loss.repeat(args.per_device_eval_batch_size)
        gathered_losses.append(accelerator.gather_for_metrics(repeated_loss))

    eval_loss = torch.mean(torch.cat(gathered_losses))
    try:
        perplexity = math.exp(eval_loss)
    except OverflowError:
        # exp() of a very large loss does not fit in a float
        perplexity = float("inf")
    return perplexity, eval_loss


def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
    # in the environment

    # when using DeepSpeed, the `gradient_accumulation_steps` is properly set from the DeepSpeed plugin/config
    # or from `accelerate launch` via `--gradient_accumulation_steps`  else
    # defaulting to the passed `args.gradient_accumulation_steps`
    accelerator = (
        Accelerator(
            log_with=args.report_to,
            project_dir=args.output_dir,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
        )
        if args.with_tracking
        else Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)
    )

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            api = HfApi(token=args.hub_token)

            # Create repo (repo_name from args or inferred)
            repo_name = args.hub_model_id
            if repo_name is None:
                repo_name = Path(args.output_dir).absolute().name
            repo_id = api.create_repo(repo_name, exist_ok=True).repo_id

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
                    gitignore.write("step_*\n")
                if "epoch_*" not in gitignore:
                    gitignore.write("epoch_*\n")
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[:{args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[{args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        dataset_args = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
            dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
        raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args)
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{args.validation_split_percentage}%]",
                **dataset_args,
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{args.validation_split_percentage}%:]",
                **dataset_args,
            )

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForCausalLM.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForCausalLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    def tokenize_function(examples):
        return tokenizer(examples[text_column_name])

    with accelerator.main_process_first():
        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )

    if args.block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
            )
        block_size = 1024
    else:
        if args.block_size > tokenizer.model_max_length:
            logger.warning(
                f"The block_size passed ({args.block_size}) is larger than the maximum length for the model"
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
        block_size = min(args.block_size, tokenizer.model_max_length)

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
    # to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

    with accelerator.main_process_first():
        lm_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            load_from_cache_file=not args.overwrite_cache,
            desc=f"Grouping texts in chunks of {block_size}",
        )

    train_dataset = lm_datasets["train"]
    eval_dataset = lm_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(
        eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size
    )

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    # New Code #
    # Creates Dummy Optimizer if `optimizer` was specified in the config file else creates Adam Optimizer
    optimizer_cls = (
        torch.optim.AdamW
        if accelerator.state.deepspeed_plugin is None
        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
        else DummyOptim
    )
    optimizer = optimizer_cls(optimizer_grouped_parameters, lr=args.learning_rate)

    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
    if accelerator.distributed_type == DistributedType.XLA:
        model.tie_weights()

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.gradient_accumulation_steps)
    overrode_max_train_steps = False
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # New Code #
    # Creates Dummy Scheduler if `scheduler` was specified in the config file else creates `args.lr_scheduler_type` Scheduler
    if (
        accelerator.state.deepspeed_plugin is None
        or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
    ):
        lr_scheduler = get_scheduler(
            name=args.lr_scheduler_type,
            optimizer=optimizer,
            num_warmup_steps=args.num_warmup_steps,
            num_training_steps=args.max_train_steps,
        )
    else:
        lr_scheduler = DummyScheduler(
            optimizer, total_num_steps=args.max_train_steps, warmup_num_steps=args.num_warmup_steps
        )

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # Figure out how many steps we should save the Accelerator states
    checkpointing_steps = args.checkpointing_steps
    if checkpointing_steps is not None and checkpointing_steps.isdigit():
        checkpointing_steps = int(checkpointing_steps)

    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initializes automatically on the main process.
    if args.with_tracking:
        experiment_config = vars(args)
        # TensorBoard cannot log Enums, need the raw value
        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
        accelerator.init_trackers("clm_no_trainer", experiment_config)

    # Train!
    total_batch_size = (
        args.per_device_train_batch_size * accelerator.num_processes * accelerator.gradient_accumulation_steps
    )

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {accelerator.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0
    starting_epoch = 0
    best_metric = None
    best_metric_checkpoint = None

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        accelerator.load_state(args.resume_from_checkpoint)
        accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
        path = os.path.basename(args.resume_from_checkpoint)
        training_difference = os.path.splitext(path)[0]

        if "epoch" in training_difference:
            starting_epoch = int(training_difference.replace("epoch_", "")) + 1
            resume_step = None
            completed_steps = starting_epoch * num_update_steps_per_epoch
        else:
            resume_step = int(training_difference.replace("step_", ""))
            starting_epoch = resume_step // num_update_steps_per_epoch
            resume_step -= starting_epoch * num_update_steps_per_epoch
            completed_steps = resume_step

    # update progress bar if resumed from checkpoint
    progress_bar.update(completed_steps)

    for epoch in range(starting_epoch, args.num_train_epochs):
        model.train()
        if args.with_tracking:
            total_loss = 0

        # skip new `skip_first_batches` to skip the batches when resuming from ckpt
        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
            # We need to skip steps until we reach the resumed step
            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
        else:
            # After the first iteration though, we need to go back to the original dataloader
            active_dataloader = train_dataloader
        for step, batch in enumerate(active_dataloader):
            # In particular, DeepSpeed handles `gradient_accumulation` via `DeepSpeedEngine`.
            # Below, we use `accelerator.accumulate` if the user
            # wants to switch to other approaches such as plain DDP, PyTorch FSDP ...
            # This avoids having to change any code as things are all handled across different distributed setups.
            with accelerator.accumulate(model):
                outputs = model(**batch)
                loss = outputs.loss
                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

                if accelerator.sync_gradients:
                    progress_bar.update(1)
                    completed_steps += 1

            # We keep track of the loss at each epoch
            if args.with_tracking:
                step_loss = accelerator.reduce(loss.detach().clone()).item()
                total_loss += step_loss

            if isinstance(checkpointing_steps, int):
                if completed_steps % checkpointing_steps == 0:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
                    accelerator.save_state(output_dir)
            if completed_steps >= args.max_train_steps:
                break

        perplexity, eval_loss = evaluate(args, model, eval_dataloader, accelerator, eval_dataset)
        logger.info(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")

        if args.with_tracking:
            accelerator.log(
                {
                    "perplexity": perplexity,
                    "eval_loss": eval_loss,
                    "train_loss": total_loss / len(train_dataloader),
                    "epoch": epoch,
                    "step": completed_steps,
                },
                step=completed_steps,
            )

        if isinstance(checkpointing_steps, str) and checkpointing_steps == "epoch":
            accelerator.save_state(os.path.join(args.output_dir, f"epoch_{epoch}"))

        # New Code #
        # Tracks the best checkpoint and best metric
        if best_metric is None or best_metric > perplexity:
            best_metric = perplexity
            best_metric_checkpoint = os.path.join(args.output_dir, "best_checkpoint")
            accelerator.save_state(best_metric_checkpoint)
            accelerator.print(f"New best metric: {best_metric} at epoch {epoch}")
            accelerator.print(f"best_metric_checkpoint: {best_metric_checkpoint}")

    # New Code #
    # Loads the best checkpoint after the training is finished
    if args.load_best_model:
        accelerator.load_state(best_metric_checkpoint)

    # New Code #
    # Evaluates using the best checkpoint
    perplexity, eval_loss = evaluate(args, model, eval_dataloader, accelerator, eval_dataset)
    logger.info(f"Best model metrics: perplexity: {perplexity} eval_loss: {eval_loss}")
    if perplexity != best_metric:
        raise AssertionError(
            f"Best metric {best_metric} does not match the metric {perplexity} of the loaded best model."
        )

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)

        # New Code #
        # Saves the whole/unpartitioned fp16 model when in ZeRO Stage-3 to the output directory if
        # `stage3_gather_16bit_weights_on_model_save` is True in DeepSpeed Config file or
        # `zero3_save_16bit_model` is True in DeepSpeed Plugin.
        # For Zero Stages 1 and 2, models are saved as usual in the output directory.
        # The model name saved is `pytorch_model.bin`
        unwrapped_model.save_pretrained(
            args.output_dir,
            is_main_process=accelerator.is_main_process,
            save_function=accelerator.save,
            state_dict=accelerator.get_state_dict(model),
        )
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
                api.upload_folder(
                    repo_id=repo_id,
                    folder_path=args.output_dir,
                    commit_message="End of training",
                )

        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
            json.dump({"perplexity": perplexity, "eval_loss": eval_loss.item()}, f)
    accelerator.end_training()


# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/early_stopping.py
================================================
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType


########################################################################
# This is a fully working simple example to use Accelerate
# specifically showcasing how to perform early stopping,
# and builds off the `nlp_example.py` script
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUS (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# Largest per-step batch size; `training_function` switches to gradient accumulation above this value.
MAX_GPU_BATCH_SIZE = 16
# Batch size of the validation `DataLoader` built in `get_dataloaders`.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    """
    Build the training and validation `DataLoader`s for GLUE MRPC, tokenized with "bert-base-cased".

    Args:
        accelerator (`Accelerator`):
            The `Accelerator` whose distributed type and mixed-precision mode drive the padding strategy.
        batch_size (`int`, *optional*):
            Batch size of the training `DataLoader`. The validation `DataLoader` always uses
            `EVAL_BATCH_SIZE`.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    raw_datasets = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # Leaving max_length as None (the default) means the model max length is used
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every split, letting the main process go first
    with accelerator.main_process_first():
        encoded_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # The models of the transformers library expect the label column to be called 'labels'
    encoded_datasets = encoded_datasets.rename_column("label", "labels")

    def collate_fn(examples):
        # Mixed precision wants sequence lengths rounded to multiples of 8 (16 for fp8)
        precision = accelerator.mixed_precision
        if precision == "no":
            pad_to_multiple_of = None
        else:
            pad_to_multiple_of = 16 if precision == "fp8" else 8

        # On TPU it's best to pad everything to the same length or training will be very slow.
        if accelerator.distributed_type == DistributedType.XLA:
            max_length = 128
        else:
            max_length = None

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors="pt",
        )

    # Instantiate dataloaders.
    train_split = tokenized_train = encoded_datasets["train"]
    train_dataloader = DataLoader(
        train_split, shuffle=True, collate_fn=collate_fn, batch_size=batch_size, drop_last=True
    )
    eval_split = encoded_datasets["validation"]
    drop_last_eval_batch = accelerator.mixed_precision == "fp8"
    eval_dataloader = DataLoader(
        eval_split,
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=EVAL_BATCH_SIZE,
        drop_last=drop_last_eval_batch,
    )

    return train_dataloader, eval_dataloader


# New code
class EarlyStoppingCallback:
    """
    Tracks the lowest loss seen so far and reports when training should stop.

    A loss counts as an improvement when it is at least `min_delta` below the lowest loss recorded so far.
    Once `patience` consecutive checks pass without an improvement, `check_early_stopping` returns `True`.
    """

    def __init__(self, min_delta=0, patience=5):
        self.min_delta, self.patience = min_delta, patience
        # Number of consecutive checks without an improvement
        self.counter = 0
        self.lowest_loss = float("inf")

    def check_early_stopping(self, eval_loss):
        """Record `eval_loss` and return `True` once `patience` checks in a row failed to improve."""
        if self.lowest_loss - eval_loss >= self.min_delta:
            # Improvement: remember the new best loss and start counting again
            self.lowest_loss = eval_loss
            self.counter = 0
            return False

        self.counter += 1
        return self.counter >= self.patience


# Module-level instance with the default settings (min_delta=0, patience=5), consulted by `training_function`.
callback = EarlyStoppingCallback()


def training_function(config, args):
    """
    Fine-tune "bert-base-cased" on GLUE MRPC and stop early once the training loss stops improving.

    After every batch the (gradient-accumulation-scaled) loss is handed to the module-level `callback`. When
    the callback asks to stop, the trigger is set on the accelerator; once the trigger is observed, the current
    epoch is cut short, one last evaluation is run and no further epoch is started.

    Args:
        config (`dict`):
            Hyper-parameters; the keys read are `"lr"`, `"num_epochs"`, `"seed"` and `"batch_size"`.
        args (`argparse.Namespace`):
            Parsed command-line flags; `cpu` and `mixed_precision` are read.
    """
    # Initialize accelerator
    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])

    metric = evaluate.load("glue", "mrpc")

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
    # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)
    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
    )

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.

    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # Now we train the model
    for epoch in range(num_epochs):
        model.train()
        # Becomes True once the early-stopping trigger has been observed during this epoch
        stop_training = False
        for step, batch in enumerate(train_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / gradient_accumulation_steps
            accelerator.backward(loss)
            if step % gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            # New code
            # Check if we should stop the training on any processes
            if callback.check_early_stopping(loss.item()):
                accelerator.set_trigger()

            # If so, we break the loop and remember it: breaking only this inner loop would let the next
            # epoch resume training even though early stopping fired.
            if accelerator.check_trigger():
                stop_training = True
                break

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()

        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)

        # New code
        # The model was still evaluated once after stopping; do not start another epoch.
        if stop_training:
            break
    accelerator.end_training()


def main():
    """Parse the command-line flags and run `training_function` with the example's fixed hyper-parameters."""
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent literals are concatenated as-is, so each fragment carries its own separating space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/fsdp_with_peak_mem_tracking.py
================================================
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import gc
import os
import threading

import evaluate
import psutil
import torch
from datasets import load_dataset
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed,
)

from accelerate import Accelerator, DistributedType, FullyShardedDataParallelPlugin
from accelerate.utils import is_npu_available, is_xpu_available


########################################################################
# This is a fully working simple example to use Accelerate
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUS (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#   - FSDP
#
# This example also demonstrates the checkpointing and sharding capabilities
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# Largest per-step batch size; `training_function` switches to gradient accumulation above this value.
MAX_GPU_BATCH_SIZE = 16
# Batch size of the validation `DataLoader` built in `training_function`.
EVAL_BATCH_SIZE = 32


# New Code #
# Converting Bytes to Megabytes
def b2mb(x):
    """Convert a byte count to whole megabytes (2**20 bytes each), truncating toward zero."""
    bytes_per_megabyte = 2**20
    megabytes = x / bytes_per_megabyte
    return int(megabytes)


# New Code #
# This context manager is used to track the peak memory usage of the process
class TorchTracemalloc:
    """
    Context manager that measures device and CPU memory usage around the enclosed block.

    Attributes available once the block has exited:
        - `begin`, `end`, `peak`: bytes allocated on the CUDA/XPU/NPU device on entry, on exit and at the
          peak. All three are 0 when none of these devices is available.
        - `used`, `peaked`: `end - begin` and `peak - begin`, converted to megabytes with `b2mb`.
        - `cpu_begin`, `cpu_end`, `cpu_peak`: resident set size of the current process in bytes, on entry,
          on exit and at the highest sampled value.
        - `cpu_used`, `cpu_peaked`: `cpu_end - cpu_begin` and `cpu_peak - cpu_begin` in megabytes.
    """

    def __enter__(self):
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.cuda.memory_allocated()
        elif is_xpu_available():
            torch.xpu.empty_cache()
            torch.xpu.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.xpu.memory_allocated()
        elif is_npu_available():
            torch.npu.empty_cache()
            torch.npu.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.npu.memory_allocated()
        else:
            # No supported device: report zero device memory instead of leaving the attribute undefined
            self.begin = 0
        self.process = psutil.Process()

        self.cpu_begin = self.cpu_mem_used()
        self.peak_monitoring = True
        # Keep a handle on the sampler thread so `__exit__` can wait for it to finish
        self._peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
        self._peak_monitor_thread.daemon = True
        self._peak_monitor_thread.start()
        return self

    def cpu_mem_used(self):
        """get resident set size memory for the current process"""
        return self.process.memory_info().rss

    def peak_monitor_func(self):
        """Sample the process memory in a loop until `peak_monitoring` is cleared, keeping the maximum."""
        self.cpu_peak = -1

        while True:
            self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak)

            # can't sleep or will not catch the peak right (this comment is here on purpose)
            # time.sleep(0.001) # 1msec

            if not self.peak_monitoring:
                break

    def __exit__(self, *exc):
        self.peak_monitoring = False
        # Wait for the sampler to stop: this guarantees `cpu_peak` exists and no longer changes when read below
        self._peak_monitor_thread.join()

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            self.end = torch.cuda.memory_allocated()
            self.peak = torch.cuda.max_memory_allocated()
        elif is_xpu_available():
            torch.xpu.empty_cache()
            self.end = torch.xpu.memory_allocated()
            self.peak = torch.xpu.max_memory_allocated()
        elif is_npu_available():
            torch.npu.empty_cache()
            self.end = torch.npu.memory_allocated()
            self.peak = torch.npu.max_memory_allocated()
        else:
            # Mirror the `__enter__` fallback so the deltas below are well defined without a device
            self.end = 0
            self.peak = 0
        self.used = b2mb(self.end - self.begin)
        self.peaked = b2mb(self.peak - self.begin)

        self.cpu_end = self.cpu_mem_used()
        self.cpu_used = b2mb(self.cpu_end - self.cpu_begin)
        self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin)
        # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}")


# For testing only: when TESTING_MOCKED_DATALOADERS is "1", bind `get_dataloaders` to the mocked helper.
# NOTE(review): `training_function` in this file builds its dataloaders inline and does not call
# `get_dataloaders`, so this rebinding looks unused here — confirm before relying on it.
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
    from accelerate.test_utils.training import mocked_dataloaders

    get_dataloaders = mocked_dataloaders  # noqa: F811


def training_function(config, args):
    """
    Fine-tune a sequence-classification model on GLUE MRPC with FSDP, reporting peak memory for every epoch.

    Each training and evaluation pass runs inside a `TorchTracemalloc` context whose measurements are printed
    and, when tracking is enabled, logged. Accelerator state can be checkpointed every N steps or every epoch,
    and training can be resumed from such a checkpoint folder (`step_{n}` or `epoch_{i}`).

    Args:
        config (`dict`):
            Hyper-parameters; the keys read are `"lr"`, `"num_epochs"`, `"seed"` and `"batch_size"`.
        args (`argparse.Namespace`):
            Parsed command-line flags (see `main`): `cpu`, `mixed_precision`, `with_tracking`, `logging_dir`,
            `checkpointing_steps`, `resume_from_checkpoint`, `output_dir` and `model_name_or_path`.

    Raises:
        ValueError: If `args.checkpointing_steps` is a string that is neither a number nor `"epoch"`.
    """
    # For testing only
    if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
        config["num_epochs"] = 2

    # New Code #
    # Pass the advanced FSDP settings not part of the accelerate config by creating fsdp_plugin
    fsdp_plugin = FullyShardedDataParallelPlugin(
        state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),
        optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=False, rank0_only=False),
    )

    # Initialize accelerator
    # `--cpu` and `--mixed_precision` are honored whether or not tracking is enabled; only the tracker
    # settings depend on `--with_tracking`.
    accelerator_kwargs = {
        "cpu": args.cpu,
        "mixed_precision": args.mixed_precision,
        "fsdp_plugin": fsdp_plugin,
    }
    if args.with_tracking:
        accelerator_kwargs["log_with"] = "wandb"
        accelerator_kwargs["project_dir"] = args.logging_dir
    accelerator = Accelerator(**accelerator_kwargs)
    accelerator.print(accelerator.distributed_type)

    if hasattr(args.checkpointing_steps, "isdigit"):
        if args.checkpointing_steps == "epoch":
            checkpointing_steps = args.checkpointing_steps
        elif args.checkpointing_steps.isdigit():
            checkpointing_steps = int(args.checkpointing_steps)
        else:
            raise ValueError(
                f"Argument `checkpointing_steps` must be either a number or `epoch`. `{args.checkpointing_steps}` passed."
            )
    else:
        checkpointing_steps = None
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])

    # We need to initialize the trackers we use, and also store our configuration
    if args.with_tracking:
        experiment_config = vars(args)
        accelerator.init_trackers("fsdp_glue_no_trainer", experiment_config)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    datasets = load_dataset("glue", "mrpc")
    metric = evaluate.load("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
        return outputs

    # Apply the method we just defined to all the examples in all the splits of the dataset
    # starting with the main process first:
    with accelerator.main_process_first():
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
    # transformers library
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
        elif accelerator.mixed_precision != "no":
            pad_to_multiple_of = 8
        else:
            pad_to_multiple_of = None

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors="pt",
        )

    # Instantiate dataloaders.
    train_dataloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
    )

    set_seed(seed)

    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, return_dict=True, low_cpu_mem_usage=True
    )

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.003,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    optimizer = torch.optim.AdamW(params=optimizer_grouped_parameters, lr=lr, weight_decay=2e-4)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=10,
        num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
    )

    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # Number of training batches processed so far; used to name the `step_{n}` checkpoints
    overall_step = 0
    # First epoch to run and, when resuming in the middle of an epoch, how many of its batches to skip
    starting_epoch = 0
    resume_step = None

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
        accelerator.load_state(args.resume_from_checkpoint)
        # `normpath` drops a trailing separator so the folder name can still be recovered
        path = os.path.basename(os.path.normpath(args.resume_from_checkpoint))
        # Extract `epoch_{i}` or `step_{i}`
        training_difference = os.path.splitext(path)[0]

        if "epoch" in training_difference:
            # `epoch_{i}` is written once epoch `i` has finished, so training restarts at the next epoch
            starting_epoch = int(training_difference.replace("epoch_", "")) + 1
            overall_step = starting_epoch * len(train_dataloader)
        else:
            overall_step = int(training_difference.replace("step_", ""))
            starting_epoch = overall_step // len(train_dataloader)
            # If resuming by step, we also need to know exactly how far into the DataLoader we went
            resume_step = overall_step - starting_epoch * len(train_dataloader)

    # Now we train the model
    for epoch in range(starting_epoch, num_epochs):
        # New Code #
        # context manager to track the peak memory usage during the training epoch
        with TorchTracemalloc() as tracemalloc:
            model.train()
            if args.with_tracking:
                total_loss = 0
            if epoch == starting_epoch and resume_step:
                # Skip the batches that were already consumed before the checkpoint was written
                active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
            else:
                # After the first resumed epoch we go back to the full dataloader
                active_dataloader = train_dataloader
            for step, batch in enumerate(active_dataloader):
                # We could avoid this line since we set the accelerator with `device_placement=True`.
                batch.to(accelerator.device)
                outputs = model(**batch)
                loss = outputs.loss
                # We keep track of the loss at each epoch
                if args.with_tracking:
                    total_loss += loss.detach().float()
                accelerator.backward(loss)
                if step % gradient_accumulation_steps == 0:
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()
                    # accelerator.print(lr_scheduler.get_lr())

                overall_step += 1

                if isinstance(checkpointing_steps, int):
                    if overall_step % checkpointing_steps == 0:
                        output_dir = f"step_{overall_step}"
                        if args.output_dir is not None:
                            output_dir = os.path.join(args.output_dir, output_dir)
                        accelerator.save_state(output_dir)
        # New Code #
        # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
        accelerator.print(f"Memory before entering the train : {b2mb(tracemalloc.begin)}")
        accelerator.print(f"Memory consumed at the end of the train (end-begin): {tracemalloc.used}")
        accelerator.print(f"Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}")
        accelerator.print(
            f"Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
        )
        # Logging the peak memory usage of the GPU to the tracker
        if args.with_tracking:
            accelerator.log(
                {
                    "train_total_peak_memory": tracemalloc.peaked + b2mb(tracemalloc.begin),
                },
                step=epoch,
            )

        # New Code #
        # context manager to track the peak memory usage during the evaluation
        with TorchTracemalloc() as tracemalloc:
            model.eval()
            for step, batch in enumerate(eval_dataloader):
                # We could avoid this line since we set the accelerator with `device_placement=True`.
                batch.to(accelerator.device)
                with torch.no_grad():
                    outputs = model(**batch)
                predictions = outputs.logits.argmax(dim=-1)
                predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
                metric.add_batch(
                    predictions=predictions,
                    references=references,
                )

            eval_metric = metric.compute()
            # Use accelerator.print to print only on the main process.
            accelerator.print(f"epoch {epoch}:", eval_metric)
            if args.with_tracking:
                accelerator.log(
                    {
                        "accuracy": eval_metric["accuracy"],
                        "f1": eval_metric["f1"],
                        "train_loss": total_loss.item() / len(train_dataloader),
                    },
                    step=epoch,
                )

            if checkpointing_steps == "epoch":
                output_dir = f"epoch_{epoch}"
                if args.output_dir is not None:
                    output_dir = os.path.join(args.output_dir, output_dir)
                accelerator.save_state(output_dir)
        # New Code #
        # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
        accelerator.print(f"Memory before entering the eval : {b2mb(tracemalloc.begin)}")
        accelerator.print(f"Memory consumed at the end of the eval (end-begin): {tracemalloc.used}")
        accelerator.print(f"Peak Memory consumed during the eval (max-begin): {tracemalloc.peaked}")
        accelerator.print(
            f"Total Peak Memory consumed during the eval (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
        )
        # Logging the peak memory usage of the GPU to the tracker
        if args.with_tracking:
            accelerator.log(
                {
                    "eval_total_peak_memory": tracemalloc.peaked + b2mb(tracemalloc.begin),
                },
                step=epoch,
            )

    accelerator.end_training()


def main():
    """
    Parse the command-line arguments and run `training_function`.

    The hyper-parameters (`lr`, `num_epochs`, `seed`, `batch_size`) are hard-coded in the `config` dict; every
    other option is read from the command line. `--model_name_or_path` is mandatory.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined without a separator, so each fragment must carry its own space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    parser.add_argument(
        "--checkpointing_steps",
        type=str,
        default=None,
        help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
    )
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help="If the training should continue from a checkpoint folder.",
    )
    parser.add_argument(
        "--with_tracking",
        action="store_true",
        help="Whether to load in all available experiment trackers from the environment and use them for logging.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=".",
        help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help="Location on where to store experiment tracking logs",
    )
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=True,
    )
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


# Only launch the example when this file is executed as a script (not when it is imported).
if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/gradient_accumulation.py
================================================
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType


########################################################################
# This is a fully working simple example to use Accelerate
# and perform gradient accumulation
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUs (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# Not referenced anywhere else in this example file.
MAX_GPU_BATCH_SIZE = 16
# Batch size of the evaluation `DataLoader` built in `get_dataloaders`.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    """
    Build the training and evaluation `DataLoader`s for the GLUE MRPC dataset,
    tokenized with the "bert-base-cased" tokenizer.

    Args:
        accelerator (`Accelerator`):
            The `Accelerator` whose distributed type and mixed-precision mode drive the padding strategy.
        batch_size (`int`, *optional*):
            The batch size of the train DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    raw_datasets = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize all the examples of all the splits, starting with the main process first.
    with accelerator.main_process_first():
        encoded_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # The models of the transformers library expect the label column to be named 'labels'.
    encoded_datasets = encoded_datasets.rename_column("label", "labels")

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
        on_xla = accelerator.distributed_type == DistributedType.XLA
        # When using mixed precision we want round multiples of 8/16:
        # "fp8" -> 16, "no" -> no rounding, any other mode -> 8.
        multiple = {"fp8": 16, "no": None}.get(accelerator.mixed_precision, 8)

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=128 if on_xla else None,
            pad_to_multiple_of=multiple,
            return_tensors="pt",
        )

    # Instantiate dataloaders: shuffled for training, in order for evaluation.
    train_loader = DataLoader(
        encoded_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
    )
    eval_loader = DataLoader(
        encoded_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
    )
    return train_loader, eval_loader


# Test hook only: when the environment asks for it, replace `get_dataloaders` with the mocked version.
if os.environ.get("TESTING_MOCKED_DATALOADERS") == "1":
    from accelerate.test_utils.training import mocked_dataloaders

    get_dataloaders = mocked_dataloaders  # noqa: F811


def training_function(config, args):
    """
    Fine-tune "bert-base-cased" on GLUE MRPC with gradient accumulation, printing the metric after each epoch.

    Args:
        config (`dict`):
            Hyper-parameters; the keys `lr`, `num_epochs`, `seed` and `batch_size` are read.
        args:
            Parsed command-line arguments; `gradient_accumulation_steps`, `cpu` and `mixed_precision` are read.

    Raises:
        NotImplementedError: When running on XLA (TPU) with more than one gradient accumulation step.
    """
    # For testing only
    running_mocked_tests = os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1"
    if running_mocked_tests:
        config["num_epochs"] = 2
    # New Code #
    accumulation_steps = int(args.gradient_accumulation_steps)
    # Initialize accelerator
    accelerator = Accelerator(
        cpu=args.cpu,
        mixed_precision=args.mixed_precision,
        gradient_accumulation_steps=accumulation_steps,
    )
    if accumulation_steps > 1 and accelerator.distributed_type == DistributedType.XLA:
        raise NotImplementedError(
            "Gradient accumulation on TPUs is currently not supported. Pass `gradient_accumulation_steps=1`"
        )
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    learning_rate = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])

    metric = evaluate.load("glue", "mrpc")

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
    # The model is built after seeding so that the seed also controls the initialization of new weights.
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
    # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=learning_rate)

    # Instantiate scheduler
    total_training_steps = len(train_dataloader) * num_epochs
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=total_training_steps,
    )

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # Now we train the model
    for epoch in range(num_epochs):
        model.train()
        for batch in train_dataloader:
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            # New code #
            # We use the new `accumulate` context manager to perform gradient accumulation
            # We also currently do not support TPUs nor advise it as bugs were found on the XLA side when running our tests.
            with accelerator.accumulate(model):
                accelerator.backward(model(**batch).loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

        model.eval()
        for batch in eval_dataloader:
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            predicted_classes = outputs.logits.argmax(dim=-1)
            gathered = accelerator.gather_for_metrics((predicted_classes, batch["labels"]))
            predictions, references = gathered
            metric.add_batch(predictions=predictions, references=references)

        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)
    accelerator.end_training()


def main():
    """
    Parse the command-line arguments and run `training_function`.

    The hyper-parameters (`lr`, `num_epochs`, `seed`, `batch_size`) are hard-coded in the `config` dict; the
    precision mode, the number of gradient accumulation steps and the CPU switch come from the command line.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined without a separator, so each fragment must carry its own space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    # New Code #
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="The number of minibatches to be ran before gradients are accumulated.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


# Only launch the example when this file is executed as a script (not when it is imported).
if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/gradient_accumulation_for_autoregressive_models.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import contextlib
import math
import os

import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, get_constant_schedule, set_seed

from accelerate import Accelerator, DistributedType


########################################################################
# This is a fully working simple example to use Accelerate
# and perform gradient accumulation on samples of variable size
#
# This example trains a SmolLM base model on WikiText-2 v1
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUs (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# Batch size of the evaluation `DataLoader`; also the repeat count applied to each batch loss during evaluation.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16, max_training_samples=500):
    """
    Build the training and evaluation `DataLoader`s for the `Salesforce/wikitext` dataset ("wikitext-2-v1"),
    tokenized with the "HuggingFaceTB/SmolLM-360M" tokenizer.

    Each collated batch carries `input_ids` and `labels` shifted by one token relative to each other; label
    positions equal to the pad token id are replaced with -100.

    Args:
        accelerator (`Accelerator`):
            The `Accelerator` whose distributed type and mixed-precision mode drive the padding strategy.
        batch_size (`int`, *optional*):
            The batch size of the train DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.
        max_training_samples (`int`, *optional*):
            Number of leading rows of the train split to keep (selected before the empty texts are filtered out).

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-360M")
    # The end-of-sequence token doubles as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    with accelerator.local_main_process_first():
        raw_datasets = load_dataset("Salesforce/wikitext", "wikitext-2-v1")
        raw_datasets["train"] = raw_datasets["train"].select(range(max_training_samples))

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        return tokenizer(examples["text"], truncation=True, max_length=None, return_attention_mask=False)

    # Drop the rows whose text is empty
    with accelerator.main_process_first():
        raw_datasets = raw_datasets.filter(
            lambda text: len(text) > 0,
            input_columns="text",
        )

    # Tokenize all the examples of all the splits, starting with the main process first.
    with accelerator.main_process_first():
        encoded_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["text"],
        )

    # Drop the rows that tokenized to nothing
    with accelerator.main_process_first():
        encoded_datasets = encoded_datasets.filter(
            lambda ids: len(ids) > 0,
            input_columns="input_ids",
        )

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
        if accelerator.distributed_type == DistributedType.XLA:
            longest = 128
        else:
            longest = max(len(example["input_ids"]) for example in examples)
        # When using mixed precision we want round multiples of 8/16:
        # "fp8" -> 16, "no" -> no rounding, any other mode -> 8.
        multiple = {"fp8": 16, "no": None}.get(accelerator.mixed_precision, 8)

        # Pad to one extra position so that inputs and labels can be shifted against each other below.
        batch = tokenizer.pad(
            examples,
            padding="max_length",
            max_length=longest + 1,
            pad_to_multiple_of=multiple,
            return_tensors="pt",
        )

        padded_ids = batch["input_ids"]
        next_tokens = padded_ids[:, 1:]
        # Labels are the next tokens, with pad positions replaced by -100.
        batch["labels"] = torch.where(next_tokens == tokenizer.pad_token_id, -100, next_tokens)
        batch["input_ids"] = padded_ids[:, :-1]
        if "attention_mask" in batch:
            batch["attention_mask"] = batch["attention_mask"][:, :-1]

        return batch

    # Instantiate dataloaders (neither of them is shuffled).
    train_loader = DataLoader(
        encoded_datasets["train"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size
    )
    eval_loader = DataLoader(
        encoded_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
    )
    return train_loader, eval_loader


# Test hook only: when the environment asks for it, replace `get_dataloaders` with the mocked version.
if os.environ.get("TESTING_MOCKED_DATALOADERS") == "1":
    from accelerate.test_utils.training import mocked_dataloaders_for_autoregressive_models

    get_dataloaders = mocked_dataloaders_for_autoregressive_models  # noqa: F811


def training_function(config, args):
    """
    Train "HuggingFaceTB/SmolLM-360M" with manual gradient accumulation and report the evaluation loss and
    perplexity after every epoch.

    For each optimizer update, all the batches of the accumulation window are pre-loaded so that the number of
    non-masked label tokens (`num_items_in_batch`) can be gathered across processes and passed to the model.

    Args:
        config (`dict`):
            Hyper-parameters; the keys `lr`, `num_epochs`, `seed`, `batch_size` and `max_grad_norm` are read.
        args:
            Parsed command-line arguments; `gradient_accumulation_steps`, `cpu`, `mixed_precision` and
            `with_wandb_tracking` are read.

    Raises:
        NotImplementedError: When running on XLA (TPU) with more than one gradient accumulation step.
    """
    # For testing only
    if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
        config["num_epochs"] = 2

    gradient_accumulation_steps = int(args.gradient_accumulation_steps)
    # Initialize accelerator
    accelerator = Accelerator(
        cpu=args.cpu,
        mixed_precision=args.mixed_precision,
        gradient_accumulation_steps=gradient_accumulation_steps,
        # `log_with=None` is the default, i.e. no tracker unless wandb tracking was requested.
        log_with="wandb" if args.with_wandb_tracking else None,
    )
    if accelerator.distributed_type == DistributedType.XLA and gradient_accumulation_steps > 1:
        raise NotImplementedError(
            "Gradient accumulation on TPUs is currently not supported. Pass `gradient_accumulation_steps=1`"
        )
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])
    max_grad_norm = config["max_grad_norm"]

    # We need to initialize the trackers we use, and also store our configuration
    if args.with_wandb_tracking:
        run = os.path.split(__file__)[-1].split(".")[0]
        run_name = f"{accelerator.num_processes}GPU-grad{gradient_accumulation_steps}-bs{batch_size}"
        accelerator.init_trackers(
            run,
            config,
            init_kwargs={"wandb": {"name": run_name}},
        )

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-360M")

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
    # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_constant_schedule(
        optimizer=optimizer,
    )

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # `len(train_dataloader)` is the number of batches per epoch. The last update of an epoch only accumulates
    # `remainder` batches when that number is not a multiple of `gradient_accumulation_steps`.
    num_samples_in_epoch = len(train_dataloader)
    remainder = num_samples_in_epoch % gradient_accumulation_steps
    remainder = remainder if remainder != 0 else gradient_accumulation_steps
    total_gradient_updates = math.ceil(num_samples_in_epoch / gradient_accumulation_steps)

    # Now we train the model
    for epoch in range(num_epochs):
        model.train()
        training_iterator = iter(train_dataloader)
        for update_step in range(total_gradient_updates):
            # In order to correctly compute the total number of non-padded tokens on which we'll compute the cross-entropy loss
            # we need to pre-load the full local batch - i.e the next per_device_batch_size * accumulation_steps samples
            num_batches_in_step = (
                gradient_accumulation_steps if update_step != (total_gradient_updates - 1) else remainder
            )
            batch_samples = [next(training_iterator) for _ in range(num_batches_in_step)]
            # get local num items in batch
            local_num_items_in_batch = sum([(batch["labels"].ne(-100)).sum() for batch in batch_samples])

            # to compute it correctly in a multi-device DDP training, we need to gather the total number of items in the full batch.
            num_items_in_batch = accelerator.gather(local_num_items_in_batch).sum().item()
            losses = []
            for i, batch in enumerate(batch_samples):
                # if we perform gradient accumulation in a multi-devices set-up, we want to avoid unnecessary communications when accumulating
                # cf: https://muellerzr.github.io/blog/gradient_accumulation.html
                ctx = (
                    model.no_sync
                    if (i < len(batch_samples) - 1 and accelerator.num_processes > 1)
                    else contextlib.nullcontext
                )
                with ctx():
                    outputs = model(**batch, use_cache=False, num_items_in_batch=num_items_in_batch)
                    loss = outputs.loss

                    # We multiply by num_processes because the DDP calculates the average gradient across all devices whereas dividing by num_items_in_batch already takes into account all devices
                    # Same reason for gradient_accumulation_steps, but this time it's Accelerate that calculates the average gradient across the accumulated steps
                    # Because the loss is already divided by `num_items_in_batch` in the `transformers` code, we don't need to do it again
                    loss = loss * gradient_accumulation_steps * accelerator.num_processes
                    accelerator.backward(loss)
                    losses.append(loss.detach())

            # Sync gradients and perform optimization steps once every gradient_accumulation_steps
            grad_norm = accelerator.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            # Undo the rescaling applied before `backward` so that the reported training loss is comparable across set-ups.
            losses = accelerator.gather(sum(losses)).sum().item() / (
                accelerator.num_processes * gradient_accumulation_steps
            )

            grad_norm = grad_norm.detach().item() if isinstance(grad_norm, torch.Tensor) else grad_norm
            accelerator.print(
                f"epoch {epoch} - update step {update_step}:: grad norm: {grad_norm} ::train loss: {losses}"
            )
            if args.with_wandb_tracking:
                accelerator.log(
                    {
                        "train/grad_norm": grad_norm,
                        "train/epoch": epoch,
                        "train/loss": losses,
                    },
                    step=update_step + total_gradient_updates * epoch,
                )
        model.eval()
        losses = []
        for batch in eval_dataloader:
            with torch.no_grad():
                outputs = model(**batch, use_cache=False)
            eval_loss = outputs.loss
            # Gather this batch's evaluation loss, repeated `EVAL_BATCH_SIZE` times, across processes.
            # (It must be `eval_loss`: `loss` still holds the rescaled loss of the last training micro-batch.)
            losses.append(accelerator.gather_for_metrics(eval_loss.repeat(EVAL_BATCH_SIZE)))

        losses = torch.cat(losses)
        try:
            eval_loss = torch.mean(losses)
            perplexity = math.exp(eval_loss)
        except OverflowError:
            perplexity = float("inf")

        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:: eval perplexity: {perplexity} eval_loss: {eval_loss}")
        if args.with_wandb_tracking:
            accelerator.log(
                {
                    "eval/perplexity": perplexity,
                    "eval/loss": eval_loss,
                    "eval/epoch": epoch,
                },
                step=update_step + total_gradient_updates * epoch,
            )
    accelerator.end_training()


def main():
    """
    Parse the command-line arguments and run `training_function`.

    `lr`, `num_epochs`, `seed` and `max_grad_norm` are hard-coded in the `config` dict, while `batch_size` is
    taken from `--per_device_batch_size`. The remaining options are forwarded through `args`.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined without a separator, so each fragment must carry its own space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )

    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="The number of minibatches to be ran before gradients are accumulated.",
    )
    parser.add_argument(
        "--per_device_batch_size",
        type=int,
        default=2,
        help="The size of each minibatch",
    )

    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    parser.add_argument(
        "--with_wandb_tracking",
        action="store_true",
        help="Whether to load in wandb from the environment and use them for logging.",
    )
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": args.per_device_batch_size, "max_grad_norm": 1.0}
    training_function(config, args)


# Only launch the example when this file is executed as a script (not when it is imported).
if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/local_sgd.py
================================================
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType
from accelerate.local_sgd import LocalSGD


########################################################################
# This is a fully working simple example to use Accelerate
# with LocalSGD, which is a method to synchronize model
# parameters every K batches. It is different, but complementary
# to gradient accumulation.
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUs (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# Not referenced anywhere else in this example file.
MAX_GPU_BATCH_SIZE = 16
# Batch size of the evaluation `DataLoader` built in `get_dataloaders`.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    """
    Create the train and validation `DataLoader`s for the GLUE MRPC dataset,
    with "bert-base-cased" as the tokenizer.

    Args:
        accelerator (`Accelerator`):
            The `Accelerator`; its distributed type and mixed-precision mode decide how batches are padded.
        batch_size (`int`, *optional*):
            The batch size of the train DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    mrpc = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        first, second = examples["sentence1"], examples["sentence2"]
        return tokenizer(first, second, truncation=True, max_length=None)

    # Run the tokenization over every split, with the main process going first.
    with accelerator.main_process_first():
        tokenized = mrpc.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # 'labels' is the column name expected by the models of the transformers library.
    tokenized = tokenized.rename_column("label", "labels")

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
        fixed_length = None
        if accelerator.distributed_type == DistributedType.XLA:
            fixed_length = 128
        # When using mixed precision we want round multiples of 8/16:
        # "fp8" -> 16, "no" -> no rounding, any other mode -> 8.
        rounding = {"fp8": 16, "no": None}.get(accelerator.mixed_precision, 8)

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=fixed_length,
            pad_to_multiple_of=rounding,
            return_tensors="pt",
        )

    # Instantiate dataloaders: the train split is shuffled, the validation split is not.
    train_loader = DataLoader(tokenized["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
    eval_loader = DataLoader(
        tokenized["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
    )
    return train_loader, eval_loader


# Test hook only: when the environment asks for it, replace `get_dataloaders` with the mocked version.
if os.environ.get("TESTING_MOCKED_DATALOADERS") == "1":
    from accelerate.test_utils.training import mocked_dataloaders

    get_dataloaders = mocked_dataloaders  # noqa: F811


def training_function(config, args):
    """
    Fine-tune "bert-base-cased" on GLUE MRPC with LocalSGD (optionally combined with gradient accumulation),
    printing the metric after each epoch.

    Args:
        config (`dict`):
            Hyper-parameters; the keys `lr`, `num_epochs`, `seed` and `batch_size` are read.
        args:
            Parsed command-line arguments; `gradient_accumulation_steps`, `local_sgd_steps`, `cpu` and
            `mixed_precision` are read. `local_sgd_steps=None` disables LocalSGD.
    """
    # For testing only
    if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
        config["num_epochs"] = 2
    # New Code #
    gradient_accumulation_steps = int(args.gradient_accumulation_steps)
    # `None` means "LocalSGD disabled" (see `enabled=` below); `int(None)` would raise a TypeError, so only
    # real values are coerced to int.
    local_sgd_steps = None if args.local_sgd_steps is None else int(args.local_sgd_steps)
    # Initialize accelerator
    accelerator = Accelerator(
        cpu=args.cpu, mixed_precision=args.mixed_precision, gradient_accumulation_steps=gradient_accumulation_steps
    )
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])

    metric = evaluate.load("glue", "mrpc")

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
    # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=(len(train_dataloader) * num_epochs),
    )

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # Now we train the model
    for epoch in range(num_epochs):
        model.train()
        with LocalSGD(
            accelerator=accelerator, model=model, local_sgd_steps=local_sgd_steps, enabled=local_sgd_steps is not None
        ) as local_sgd:
            for batch in train_dataloader:
                # We could avoid this line since we set the accelerator with `device_placement=True`.
                batch.to(accelerator.device)
                # New code #
                # We use the new `accumulate` context manager to perform gradient accumulation
                # We also currently do not support TPUs nor advise it as bugs were found on the XLA side when running our tests.
                with accelerator.accumulate(model):
                    output = model(**batch)
                    loss = output.loss
                    accelerator.backward(loss)
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()
                    # LocalSGD-specific line
                    local_sgd.step()

        model.eval()
        for batch in eval_dataloader:
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)
    accelerator.end_training()


def main():
    """Parse the command-line options and start `training_function`.

    The optimisation hyper-parameters (learning rate, number of epochs, seed and batch size) are
    fixed in a small config dictionary; the remaining options come from the command line.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined verbatim, so every fragment must carry its own
        # separating space; otherwise `--help` prints words glued together.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    # New Code #
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="The number of minibatches to be ran before gradients are accumulated.",
    )
    parser.add_argument(
        "--local_sgd_steps", type=int, default=8, help="Number of local SGD steps or None to disable local SGD"
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/megatron_lm_gpt_pretraining.py
================================================
#!/usr/bin/env python
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...)
on a text file or a dataset without using HuggingFace Trainer.

Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=text-generation
"""
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.

import argparse
import json
import logging
import math
import os
import random
from itertools import chain
from pathlib import Path

import datasets
import torch
import transformers
from datasets import load_dataset
from huggingface_hub import HfApi
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    SchedulerType,
    default_data_collator,
    get_scheduler,
)
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

from accelerate import Accelerator, DistributedType, init_empty_weights
from accelerate.logging import get_logger
from accelerate.utils import MegatronLMDummyScheduler, set_seed


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.23.0.dev0")

# Module-level logger created through accelerate's `get_logger`.
logger = get_logger(__name__)

# Require `datasets>=1.8.0`; the second argument is the hint shown alongside the requirement.
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

# Config classes registered in `MODEL_MAPPING` and their `model_type` strings.
# `MODEL_TYPES` is the list of allowed `choices` for the `--model_type` command-line argument.
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)


def parse_args():
    """Build the command-line parser, parse `sys.argv` and validate the result.

    Returns:
        `argparse.Namespace`: The parsed arguments.

    Raises:
        ValueError: If none of `--dataset_name`, `--train_file` and `--validation_file` is given.
        AssertionError: If a data file does not end in `csv`, `json` or `txt`, or if `--push_to_hub`
            is passed without `--output_dir`.
    """
    parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task")
    parser.add_argument(
        "--dataset_name",
        type=str,
        default=None,
        help="The name of the dataset to use (via the datasets library).",
    )
    parser.add_argument(
        "--dataset_config_name",
        type=str,
        default=None,
        help="The configuration name of the dataset to use (via the datasets library).",
    )
    parser.add_argument(
        "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
    )
    parser.add_argument(
        "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
    )
    parser.add_argument(
        "--validation_split_percentage",
        # Parsed as an int so the value has the same type whether it comes from the default or the CLI.
        type=int,
        default=5,
        help="The percentage of the train set used as validation set in case there's no validation split",
    )
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=False,
    )
    parser.add_argument(
        "--config_name",
        type=str,
        default=None,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default=None,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
        type=int,
        default=8,
        help="Batch size (per device) for the training dataloader.",
    )
    parser.add_argument(
        "--per_device_eval_batch_size",
        type=int,
        default=8,
        help="Batch size (per device) for the evaluation dataloader.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-5,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=None,
        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--lr_scheduler_type",
        type=SchedulerType,
        default="linear",
        help="The scheduler type to use.",
        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
    )
    parser.add_argument(
        "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
    )
    parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--model_type",
        type=str,
        default=None,
        help="Model type to use if training from scratch.",
        choices=MODEL_TYPES,
    )
    parser.add_argument(
        "--block_size",
        type=int,
        default=None,
        help=(
            "Optional input sequence length after tokenization. The training dataset will be truncated in block of"
            " this size for training. Default to the model max input length for single sentence inputs (take into"
            " account special tokens)."
        ),
    )
    parser.add_argument(
        "--preprocessing_num_workers",
        type=int,
        default=None,
        help="The number of processes to use for the preprocessing.",
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument(
        "--no_keep_linebreaks", action="store_true", help="Do not keep line breaks when using TXT files."
    )
    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
    parser.add_argument(
        "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
    )
    parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
    parser.add_argument(
        "--checkpointing_steps",
        type=str,
        default=None,
        help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
    )
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help="If the training should continue from a checkpoint folder.",
    )
    parser.add_argument(
        "--initial_megatron_lm_checkpoint",
        type=str,
        default=None,
        help="If the training should start from a Megatron-LM checkpoint.",
    )
    parser.add_argument(
        "--with_tracking",
        action="store_true",
        help="Whether to enable experiment trackers for logging.",
    )
    parser.add_argument(
        "--report_to",
        type=str,
        default="all",
        # Each fragment ends with (or starts with) its own space: adjacent literals are joined verbatim.
        help=(
            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
            ' `"wandb"`, `"comet_ml"`, `"dvclive"`, and `"swanlab"`. Use `"all"` (default) to report to all'
            " integrations. Only applicable when `--with_tracking` is passed."
        ),
    )
    args = parser.parse_args()

    # Sanity checks
    if args.dataset_name is None and args.train_file is None and args.validation_file is None:
        raise ValueError("Need either a dataset name or a training/validation file.")

    # Only local data files are checked for a supported extension.
    if args.train_file is not None:
        extension = args.train_file.split(".")[-1]
        assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file."
    if args.validation_file is not None:
        extension = args.validation_file.split(".")[-1]
        assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."

    if args.push_to_hub:
        assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."

    return args


def main():
    """Train and evaluate a causal language model, with dedicated handling for Megatron-LM.

    The function parses the command-line arguments, creates the `Accelerator`, loads and tokenizes
    the dataset, groups the tokens into fixed-size blocks, builds the model, optimizer and scheduler,
    optionally resumes from a checkpoint, runs the training/evaluation loop and finally saves the
    model, the tokenizer and an `all_results.json` file holding the last perplexity.

    Megatron-LM specific branches (marked with `# New Code`) use `MegatronLMDummyScheduler`, read the
    global batch size from the Megatron-LM plugin, skip gathering the evaluation loss and save the
    model through `accelerator.save_state`.
    """
    args = parse_args()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_clm_no_trainer", args)

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
    # in the environment
    accelerator_log_kwargs = {}

    if args.with_tracking:
        accelerator_log_kwargs["log_with"] = args.report_to
        accelerator_log_kwargs["project_dir"] = args.output_dir

    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        # The output directory must exist in every case, including `--push_to_hub`, because the
        # `.gitignore` below is written into it.
        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
        if args.push_to_hub:
            api = HfApi(token=args.hub_token)

            # Create repo (repo_name from args or inferred)
            repo_name = args.hub_model_id
            if repo_name is None:
                repo_name = Path(args.output_dir).absolute().name
            repo_id = api.create_repo(repo_name, exist_ok=True).repo_id

            # Keep whatever is already in `.gitignore` and only append the checkpoint patterns that
            # are missing. In append mode every write goes to the end of the file.
            with open(os.path.join(args.output_dir, ".gitignore"), "a+") as gitignore:
                gitignore.seek(0)
                existing_content = gitignore.read()
                existing_patterns = {line.strip() for line in existing_content.splitlines()}
                if existing_content and not existing_content.endswith("\n"):
                    gitignore.write("\n")
                for pattern in ("step_*", "epoch_*"):
                    if pattern not in existing_patterns:
                        gitignore.write(f"{pattern}\n")
    accelerator.wait_for_everyone()

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[:{args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[{args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        dataset_args = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
            dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
        raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args)
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{args.validation_split_percentage}%]",
                **dataset_args,
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{args.validation_split_percentage}%:]",
                **dataset_args,
            )

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        # if we are using Megatron-LM, we can use init_empty_weights to load the model without initializing the weights
        # since the weights are loaded later.
        if args.resume_from_checkpoint is not None or args.initial_megatron_lm_checkpoint is not None:
            assert config is not None, "config should not be None for Megatron-LM"
            with init_empty_weights():
                model = AutoModelForCausalLM.from_config(config)
        else:
            model = AutoModelForCausalLM.from_pretrained(
                args.model_name_or_path,
                from_tf=bool(".ckpt" in args.model_name_or_path),
                config=config,
            )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForCausalLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    def tokenize_function(examples):
        return tokenizer(examples[text_column_name])

    with accelerator.main_process_first():
        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )

    if args.block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
            )
            block_size = 1024
    else:
        if args.block_size > tokenizer.model_max_length:
            logger.warning(
                f"The block_size passed ({args.block_size}) is larger than the maximum length for the model"
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
        block_size = min(args.block_size, tokenizer.model_max_length)

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
    # to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

    with accelerator.main_process_first():
        lm_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            load_from_cache_file=not args.overwrite_cache,
            desc=f"Grouping texts in chunks of {block_size}",
        )

    train_dataset = lm_datasets["train"]
    eval_dataset = lm_datasets["validation"]

    # Log a few random samples from the training set (at most 3, fewer when the set is smaller,
    # since `random.sample` raises when asked for more items than are available):
    for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(
        eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size
    )

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "layer_norm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    # New Code
    # For Megatron-LM, we need to use `MegatronLMDummyScheduler` instead of regular schedulers
    if accelerator.distributed_type == DistributedType.MEGATRON_LM:
        lr_scheduler = MegatronLMDummyScheduler(
            optimizer=optimizer,
            total_num_steps=args.max_train_steps,
            warmup_num_steps=args.num_warmup_steps,
        )
    else:
        lr_scheduler = get_scheduler(
            name=args.lr_scheduler_type,
            optimizer=optimizer,
            num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
            num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
        )

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
    if accelerator.distributed_type == DistributedType.XLA:
        model.tie_weights()

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # Figure out how many steps we should save the Accelerator states
    checkpointing_steps = args.checkpointing_steps
    if checkpointing_steps is not None and checkpointing_steps.isdigit():
        checkpointing_steps = int(checkpointing_steps)

    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initializes automatically on the main process.
    if args.with_tracking:
        experiment_config = vars(args)
        # TensorBoard cannot log Enums, need the raw value
        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
        accelerator.init_trackers("clm_no_trainer", experiment_config)

    # Train!
    # New Code
    # For Megatron-LM, we need to get `global_batch_size` from megatron_lm_plugin
    # as it handles the specifics related to data parallelism, tensor model parallelism and pipeline parallelism
    if accelerator.distributed_type == DistributedType.MEGATRON_LM:
        total_batch_size = accelerator.state.megatron_lm_plugin.global_batch_size
    else:
        total_batch_size = (
            args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
        )

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0
    starting_epoch = 0

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
        accelerator.load_state(args.resume_from_checkpoint)
        # `normpath` drops a trailing separator, which would otherwise make `basename` return "".
        path = os.path.basename(os.path.normpath(args.resume_from_checkpoint))
        # Extract `epoch_{i}` or `step_{i}`
        training_difference = os.path.splitext(path)[0]

        if "epoch" in training_difference:
            starting_epoch = int(training_difference.replace("epoch_", "")) + 1
            resume_step = None
        else:
            # need to multiply `gradient_accumulation_steps` to reflect real steps
            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
            starting_epoch = resume_step // len(train_dataloader)
            resume_step -= starting_epoch * len(train_dataloader)

    if args.initial_megatron_lm_checkpoint:
        assert accelerator.distributed_type == DistributedType.MEGATRON_LM, (
            "initial_megatron_lm_checkpoint should only be used with Megatron-LM"
        )
        assert args.resume_from_checkpoint is None, (
            "resume_from_checkpoint should not be provided when initial_megatron_lm_checkpoint is provided"
        )
        accelerator.print(
            f"Loading Megatron-LM checkpoint from the initial checkpoint (directly from the release directory converted using megatron bridge): {args.initial_megatron_lm_checkpoint}"
        )
        checkpoint_dir = args.initial_megatron_lm_checkpoint
        latest_iter_file = os.path.join(checkpoint_dir, "latest_checkpointed_iteration.txt")
        assert os.path.isfile(latest_iter_file), f"{latest_iter_file} does not exist in {checkpoint_dir}"
        with open(latest_iter_file) as f:
            contents = f.read().strip()
        assert contents == "0", (
            f"latest_checkpointed_iteration.txt in {checkpoint_dir} must contain only '0' (found '{contents}'), please mannually change it to '0' and rename the directory release to iter_0000000, also make sure megatron_lm_no_load_optim is set to true in the config file"
        )
        # Also assert iter_0000000 directory exists
        iter0_dir = os.path.join(checkpoint_dir, "iter_0000000")
        assert os.path.isdir(iter0_dir), (
            f"{iter0_dir} directory does not exist in {checkpoint_dir}, please rename the release directory to iter_0000000"
        )
        accelerator.load_state(args.initial_megatron_lm_checkpoint)
    # update the progress_bar if load from checkpoint
    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
    completed_steps = starting_epoch * num_update_steps_per_epoch

    for epoch in range(starting_epoch, args.num_train_epochs):
        model.train()
        if args.with_tracking:
            total_loss = 0
        for step, batch in enumerate(train_dataloader):
            # We need to skip steps until we reach the resumed step
            if args.resume_from_checkpoint and epoch == starting_epoch:
                if resume_step is not None and step < resume_step:
                    if step % args.gradient_accumulation_steps == 0:
                        progress_bar.update(1)
                        completed_steps += 1
                    continue

            with accelerator.accumulate(model):
                outputs = model(**batch)
                loss = outputs.loss
                # We keep track of the loss at each epoch
                if args.with_tracking:
                    total_loss += loss.detach().float()
                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
                # Only checkpoint right after an optimization step; otherwise the same state would be
                # saved again on every accumulation micro-batch while `completed_steps` is unchanged.
                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
                    accelerator.save_state(output_dir)
            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        losses = []
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)

            loss = outputs.loss
            # New Code
            # For Megatron-LM, the losses are already averaged across the data parallel group
            if accelerator.distributed_type == DistributedType.MEGATRON_LM:
                losses.append(loss)
            else:
                losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size)))
        try:
            if accelerator.distributed_type == DistributedType.MEGATRON_LM:
                losses = torch.tensor(losses)
            else:
                losses = torch.cat(losses)
            eval_loss = torch.mean(losses)
            perplexity = math.exp(eval_loss)
        except OverflowError:
            # `math.exp` overflows for a very large loss; report an infinite perplexity instead.
            perplexity = float("inf")

        logger.info(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")

        if args.with_tracking:
            accelerator.log(
                {
                    "perplexity": perplexity,
                    "eval_loss": eval_loss,
                    "train_loss": total_loss.item() / len(train_dataloader),
                    "epoch": epoch,
                    "step": completed_steps,
                },
                step=completed_steps,
            )

        # Intermediate pushes happen for every epoch except the last one, which is pushed after the loop.
        if args.push_to_hub and epoch < args.num_train_epochs - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
            )
            if accelerator.is_main_process:
                tokenizer.save_pretrained(args.output_dir)
                api.upload_folder(
                    repo_id=repo_id,
                    folder_path=args.output_dir,
                    commit_message=f"Training in progress epoch {epoch}",
                    run_as_future=True,
                )

        if args.checkpointing_steps == "epoch":
            output_dir = f"epoch_{epoch}"
            if args.output_dir is not None:
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

    # this is causing some issue with Megatron-LM when using `wandb` at the end of the main function.
    # Everything works fine in spite of commenting this out. (wandb finishes/closes the run without error)
    # if args.with_tracking:
    #     accelerator.end_training()

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        # New Code
        # For Megatron-LM, we need to save the model using `accelerator.save_state`
        if accelerator.distributed_type == DistributedType.MEGATRON_LM:
            accelerator.save_state(args.output_dir)
        else:
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
            )
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
                api.upload_folder(
                    repo_id=repo_id,
                    folder_path=args.output_dir,
                    commit_message="End of training",
                )

        # NOTE(review): this file is written by every process, not only the main one — confirm
        # whether that is intended for the Megatron-LM setup before restricting it.
        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
            json.dump({"perplexity": perplexity}, f)
    accelerator.end_training()


# Run the training script only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/memory.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

# New Code #
import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType
from accelerate.utils import find_executable_batch_size


########################################################################
# This is a fully working simple example to use Accelerate,
# specifically showcasing how to ensure out-of-memory errors never
# interrupt training, and builds off the `nlp_example.py` script.
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUs (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# New additions from the base script can be found quickly by
# looking for the # New Code # tags
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# Not referenced elsewhere in this script; the starting batch size comes from `config["batch_size"]`.
MAX_GPU_BATCH_SIZE = 16
# Fixed batch size of the validation DataLoader built in `get_dataloaders`.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    """
    Build the training and validation `DataLoader`s for GLUE MRPC, tokenized with "bert-base-cased".

    Args:
        accelerator (`Accelerator`):
            Used to run the tokenization with the main process first, and to choose the padding settings from
            its `distributed_type` and `mixed_precision`.
        batch_size (`int`, *optional*, defaults to 16):
            Batch size of the training DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    raw_datasets = load_dataset("glue", "mrpc")

    def encode_pairs(examples):
        # Leaving max_length at None falls back to the model max length (which is the default anyway).
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every example of every split, letting the main process go first.
    with accelerator.main_process_first():
        encoded = raw_datasets.map(
            encode_pairs,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # Models of the transformers library expect the target column to be called "labels".
    encoded = encoded.rename_column("label", "labels")

    def pad_batch(examples):
        # On TPU every batch is padded to the same fixed length, otherwise training will be very slow.
        on_xla = accelerator.distributed_type == DistributedType.XLA
        max_length = 128 if on_xla else None

        # With mixed precision, sequence lengths are rounded up to a multiple of 8 (16 for fp8).
        precision = accelerator.mixed_precision
        if precision == "no":
            multiple = None
        elif precision == "fp8":
            multiple = 16
        else:
            multiple = 8

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=max_length,
            pad_to_multiple_of=multiple,
            return_tensors="pt",
        )

    # Only the training split is shuffled and follows the requested batch size.
    train_dataloader = DataLoader(
        encoded["train"], batch_size=batch_size, shuffle=True, collate_fn=pad_batch
    )
    eval_dataloader = DataLoader(
        encoded["validation"], batch_size=EVAL_BATCH_SIZE, shuffle=False, collate_fn=pad_batch
    )

    return train_dataloader, eval_dataloader


# For testing only: when the TESTING_MOCKED_DATALOADERS environment variable is "1", rebind `get_dataloaders`
# to the `mocked_dataloaders` helper from `accelerate.test_utils.training`.
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
    from accelerate.test_utils.training import mocked_dataloaders

    get_dataloaders = mocked_dataloaders  # noqa: F811


def training_function(config, args):
    """
    Fine-tune "bert-base-cased" on GLUE MRPC inside a loop wrapped by `find_executable_batch_size`.

    The model, optimizer, dataloaders and scheduler are all created inside the decorated inner function, which
    receives the batch size as its only parameter and starts from `config["batch_size"]`. The GLUE MRPC metric
    is printed on the main process after every epoch.

    Args:
        config (`dict`):
            Hyper-parameters; the keys "lr", "num_epochs", "seed" and "batch_size" are read. "num_epochs" is
            overwritten with 2 when the `TESTING_MOCKED_DATALOADERS` environment variable is "1".
        args:
            Parsed command-line arguments; `args.cpu` and `args.mixed_precision` are forwarded to
            `Accelerator`.
    """
    # For testing only
    if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
        config["num_epochs"] = 2

    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)

    # Hyper-parameters: learning rate, number of epochs, seed and the batch size to start from.
    learning_rate = config["lr"]
    epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    starting_batch_size = int(config["batch_size"])

    metric = evaluate.load("glue", "mrpc")

    # New Code #
    # The whole training loop lives in an inner function that takes the batch size as its only parameter,
    # builds the dataloaders itself, and is wrapped by the decorator.
    @find_executable_batch_size(starting_batch_size=starting_batch_size)
    def inner_training_loop(batch_size):
        # The Accelerator created above is reused here...
        nonlocal accelerator
        # ...after resetting every attribute of it that could still hold onto memory.
        accelerator.free_memory()

        # Seed before building the model so the seed also controls the initialization of new weights.
        set_seed(seed)
        model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

        # Optional with `device_placement=True` (the default), but when tensors are placed manually this must
        # happen before the optimizer is created, otherwise training will not work on TPU.
        model = model.to(accelerator.device)

        optimizer = AdamW(params=model.parameters(), lr=learning_rate)
        train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)

        total_training_steps = len(train_dataloader) * epochs
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=100,
            num_training_steps=total_training_steps,
        )

        # `prepare` returns the objects in the order they were passed in.
        prepared = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader, lr_scheduler)
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = prepared

        for epoch in range(epochs):
            # Training pass
            model.train()
            for batch in train_dataloader:
                # Optional as well, since the accelerator uses `device_placement=True`.
                batch.to(accelerator.device)
                outputs = model(**batch)
                accelerator.backward(outputs.loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            # Evaluation pass
            model.eval()
            for batch in eval_dataloader:
                # Optional as well, since the accelerator uses `device_placement=True`.
                batch.to(accelerator.device)
                with torch.no_grad():
                    outputs = model(**batch)
                predictions, references = accelerator.gather_for_metrics(
                    (outputs.logits.argmax(dim=-1), batch["labels"])
                )
                metric.add_batch(predictions=predictions, references=references)

            eval_metric = metric.compute()
            # accelerator.print only prints on the main process.
            accelerator.print(f"epoch {epoch}:", eval_metric)

    # New Code #
    # The decorated function is called without arguments; the decorator supplies the batch size.
    # Note: this could also be refactored outside of the training loop function.
    inner_training_loop()
    accelerator.end_training()


def main():
    """
    Parse the command-line options and launch `training_function`.

    Recognised options are `--mixed_precision` (one of "no", "fp16", "bf16", "fp8") and `--cpu`. The
    hyper-parameters (learning rate, number of epochs, seed, batch size) are fixed in the `config` dict below.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined verbatim, so each fragment carries its own trailing space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


# Script entry point: only call `main()` when this file is executed directly.
if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/multi_process_metrics.py
================================================
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType


########################################################################
# This is a fully working simple example to use Accelerate,
# specifically showcasing how to properly calculate the metrics on the
# validation dataset when in a distributed system, and builds off the
# `nlp_example.py` script.
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUs (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To help focus on the differences in the code, building `DataLoaders`
# was refactored into its own function.
# New additions from the base script can be found quickly by
# looking for the # New Code # tags
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# Largest batch size `training_function` feeds per step: a larger requested batch size is capped to this
# value and the remainder is handled through gradient accumulation (except on XLA).
MAX_GPU_BATCH_SIZE = 16
# Fixed batch size of the validation DataLoader built in `get_dataloaders`.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    """
    Build the training and validation `DataLoader`s for GLUE MRPC, tokenized with "bert-base-cased".

    Args:
        accelerator (`Accelerator`):
            Used to run the tokenization with the main process first, and to choose the padding settings from
            its `distributed_type` and `mixed_precision`.
        batch_size (`int`, *optional*, defaults to 16):
            Batch size of the training DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    raw_datasets = load_dataset("glue", "mrpc")

    def encode_pairs(examples):
        # Leaving max_length at None falls back to the model max length (which is the default anyway).
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every example of every split, letting the main process go first.
    with accelerator.main_process_first():
        encoded = raw_datasets.map(
            encode_pairs,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # Models of the transformers library expect the target column to be called "labels".
    encoded = encoded.rename_column("label", "labels")

    def pad_batch(examples):
        # On TPU every batch is padded to the same fixed length, otherwise training will be very slow.
        on_xla = accelerator.distributed_type == DistributedType.XLA
        max_length = 128 if on_xla else None

        # With mixed precision, sequence lengths are rounded up to a multiple of 8 (16 for fp8).
        precision = accelerator.mixed_precision
        if precision == "no":
            multiple = None
        elif precision == "fp8":
            multiple = 16
        else:
            multiple = 8

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=max_length,
            pad_to_multiple_of=multiple,
            return_tensors="pt",
        )

    # Only the training split is shuffled and follows the requested batch size.
    train_dataloader = DataLoader(
        encoded["train"], batch_size=batch_size, shuffle=True, collate_fn=pad_batch
    )
    eval_dataloader = DataLoader(
        encoded["validation"], batch_size=EVAL_BATCH_SIZE, shuffle=False, collate_fn=pad_batch
    )

    return train_dataloader, eval_dataloader


# For testing only: when the TESTING_MOCKED_DATALOADERS environment variable is "1", rebind `get_dataloaders`
# to the `mocked_dataloaders` helper from `accelerate.test_utils.training`.
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
    from accelerate.test_utils.training import mocked_dataloaders

    get_dataloaders = mocked_dataloaders  # noqa: F811


def training_function(config, args):
    """
    Train "bert-base-cased" on GLUE MRPC and compute the validation metric by hand across processes.

    After every epoch, the predictions and labels of each evaluation batch are collected with
    `accelerator.gather`; on distributed setups the surplus samples of the last gathered batch are dropped before
    the batch is added to the GLUE MRPC metric, which is then printed on the main process.

    Args:
        config (`dict`):
            Hyper-parameters; the keys "lr", "num_epochs", "seed" and "batch_size" are read. "num_epochs" is
            overwritten with 2 when the `TESTING_MOCKED_DATALOADERS` environment variable is "1".
        args:
            Parsed command-line arguments; `args.cpu` and `args.mixed_precision` are forwarded to
            `Accelerator`.
    """
    # For testing only
    if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
        config["num_epochs"] = 2
    # Initialize accelerator
    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])

    metric = evaluate.load("glue", "mrpc")

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
    # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
    )

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # Now we train the model
    for epoch in range(num_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / gradient_accumulation_steps
            accelerator.backward(loss)
            # Update once a full group of `gradient_accumulation_steps` batches has been accumulated. Batches are
            # counted from 1 so the very first batch does not trigger an update on its own. The last batch of
            # the epoch also updates, so leftover gradients are not carried over into the next epoch.
            is_last_batch = (step + 1) == len(train_dataloader)
            if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

        model.eval()
        samples_seen = 0
        for step, batch in enumerate(eval_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            predictions, references = accelerator.gather((predictions, batch["labels"]))
            # New Code #
            # First we check if it's a distributed system
            if accelerator.use_distributed:
                # Then see if we're on the last batch of our eval dataloader
                if step == len(eval_dataloader) - 1:
                    # Last batch needs to be truncated on distributed systems as it contains additional samples
                    predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
                    references = references[: len(eval_dataloader.dataset) - samples_seen]
                else:
                    # Otherwise we add the number of samples seen
                    samples_seen += references.shape[0]
            # All of this can be avoided if you use `Accelerator.gather_for_metrics` instead of `Accelerator.gather`:
            # accelerator.gather_for_metrics((predictions, batch["labels"]))
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)
    accelerator.end_training()


def main():
    """
    Parse the command-line options and launch `training_function`.

    Recognised options are `--mixed_precision` (one of "no", "fp16", "bf16", "fp8") and `--cpu`. The
    hyper-parameters (learning rate, number of epochs, seed, batch size) are fixed in the `config` dict below.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined verbatim, so each fragment carries its own trailing space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


# Script entry point: only call `main()` when this file is executed directly.
if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/profiler.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType
from accelerate.utils import ProfileKwargs


########################################################################
# This is a fully working simple example to use Accelerate
# and perform profiling
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single device (CUDA GPU, Intel XPU etc.)
#   - multi devices (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# Not referenced elsewhere in this script; the training batch size comes from `config["batch_size"]`.
MAX_GPU_BATCH_SIZE = 16
# Fixed batch size of the validation DataLoader built in `get_dataloaders`.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    """
    Build the training and validation `DataLoader`s for GLUE MRPC, tokenized with "bert-base-cased".

    Args:
        accelerator (`Accelerator`):
            Used to run the tokenization with the main process first, and to choose the padding settings from
            its `distributed_type` and `mixed_precision`.
        batch_size (`int`, *optional*, defaults to 16):
            Batch size of the training DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    raw_datasets = load_dataset("glue", "mrpc")

    def encode_pairs(examples):
        # Leaving max_length at None falls back to the model max length (which is the default anyway).
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every example of every split, letting the main process go first.
    with accelerator.main_process_first():
        encoded = raw_datasets.map(
            encode_pairs,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # Models of the transformers library expect the target column to be called "labels".
    encoded = encoded.rename_column("label", "labels")

    def pad_batch(examples):
        # On TPU every batch is padded to the same fixed length, otherwise training will be very slow.
        on_xla = accelerator.distributed_type == DistributedType.XLA
        max_length = 128 if on_xla else None

        # With mixed precision, sequence lengths are rounded up to a multiple of 8 (16 for fp8).
        precision = accelerator.mixed_precision
        if precision == "no":
            multiple = None
        elif precision == "fp8":
            multiple = 16
        else:
            multiple = 8

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=max_length,
            pad_to_multiple_of=multiple,
            return_tensors="pt",
        )

    # Only the training split is shuffled and follows the requested batch size.
    train_dataloader = DataLoader(
        encoded["train"], batch_size=batch_size, shuffle=True, collate_fn=pad_batch
    )
    eval_dataloader = DataLoader(
        encoded["validation"], batch_size=EVAL_BATCH_SIZE, shuffle=False, collate_fn=pad_batch
    )

    return train_dataloader, eval_dataloader


# For testing only: when the TESTING_MOCKED_DATALOADERS environment variable is "1", rebind `get_dataloaders`
# to the `mocked_dataloaders` helper from `accelerate.test_utils.training`.
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
    from accelerate.test_utils.training import mocked_dataloaders

    get_dataloaders = mocked_dataloaders  # noqa: F811


def training_function(config, args):
    """
    Fine-tune "bert-base-cased" on GLUE MRPC while profiling the training pass of every epoch.

    A `ProfileKwargs` handler built from the command-line flags is passed to the `Accelerator`; each epoch's
    training loop then runs inside `accelerator.profile()`, and the averaged profiler table is printed before
    the evaluation pass. The GLUE MRPC metric is printed on the main process after every epoch.

    Args:
        config (`dict`):
            Hyper-parameters; the keys "lr", "num_epochs", "seed" and "batch_size" are read. "num_epochs" is
            overwritten with 2 when the `TESTING_MOCKED_DATALOADERS` environment variable is "1".
        args:
            Parsed command-line arguments; `cpu`, `mixed_precision`, `record_shapes`, `profile_memory`,
            `with_flops` and `output_trace_dir` are read.
    """
    # For testing only
    if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
        config["num_epochs"] = 2

    # New Code #
    # Profiler options are taken straight from the command-line flags.
    profile_kwargs = ProfileKwargs(
        record_shapes=args.record_shapes,
        profile_memory=args.profile_memory,
        with_flops=args.with_flops,
        output_trace_dir=args.output_trace_dir,
    )
    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision, kwargs_handlers=[profile_kwargs])

    # Hyper-parameters: learning rate, number of epochs, seed and batch size.
    learning_rate = config["lr"]
    epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    train_batch_size = int(config["batch_size"])

    metric = evaluate.load("glue", "mrpc")

    # Seed before building the model so the seed also controls the initialization of new weights.
    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, train_batch_size)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

    # Optional with `device_placement=True` (the default), but when tensors are placed manually this must happen
    # before the optimizer is created, otherwise training will not work on TPU.
    model = model.to(accelerator.device)

    optimizer = AdamW(params=model.parameters(), lr=learning_rate)

    total_training_steps = len(train_dataloader) * epochs
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=total_training_steps,
    )

    # `prepare` returns the objects in the order they were passed in.
    prepared = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader, lr_scheduler)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = prepared

    for epoch in range(epochs):
        model.train()
        # New Code #
        # Everything executed inside this context is recorded by the profiler.
        with accelerator.profile() as prof:
            for batch in train_dataloader:
                # Optional, since the accelerator uses `device_placement=True`.
                batch.to(accelerator.device)
                # The `accumulate` context manager takes care of gradient accumulation.
                with accelerator.accumulate(model):
                    output = model(**batch)
                    accelerator.backward(output.loss)
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()

        # New Code #
        # Sort the profiler table by self CPU time on CPU runs, otherwise by self time of the current device type.
        if args.cpu:
            sort_key = "self_cpu_time_total"
        else:
            sort_key = f"self_{accelerator.device.type}_time_total"
        accelerator.print(prof.key_averages().table(sort_by=sort_key, row_limit=-1))

        model.eval()
        for batch in eval_dataloader:
            # Optional, since the accelerator uses `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions, references = accelerator.gather_for_metrics(
                (outputs.logits.argmax(dim=-1), batch["labels"])
            )
            metric.add_batch(predictions=predictions, references=references)

        eval_metric = metric.compute()
        # accelerator.print only prints on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)
    accelerator.end_training()


def main():
    """
    Parse the command-line options and launch `training_function`.

    Besides `--mixed_precision` and `--cpu`, the profiling flags `--record_shapes`, `--profile_memory`,
    `--with_flops` and `--output_trace_dir` are accepted. The hyper-parameters (learning rate, number of epochs,
    seed, batch size) are fixed in the `config` dict below.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined verbatim, so each fragment carries its own trailing space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU or an Intel XPU.",
    )
    # New Code #
    parser.add_argument(
        "--record_shapes",
        action="store_true",
        default=False,
        help="If passed, will record shapes for profiling.",
    )
    # New Code #
    parser.add_argument(
        "--profile_memory",
        action="store_true",
        default=False,
        help="If passed, will profile memory.",
    )
    # New Code #
    parser.add_argument(
        "--with_flops",
        action="store_true",
        default=False,
        help="If passed, will profile flops.",
    )
    # New Code #
    parser.add_argument(
        "--output_trace_dir",
        type=str,
        default=None,
        help="If passed, will save a json trace to the specified path.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


# Script entry point: only call `main()` when this file is executed directly.
if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/schedule_free.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

import evaluate
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed

from accelerate import Accelerator, DistributedType
from accelerate.utils import is_schedulefree_available


# `schedulefree` is an optional dependency: import it when `is_schedulefree_available()` reports it as present,
# otherwise stop at import time with installation instructions.
if is_schedulefree_available():
    import schedulefree
else:
    raise ImportError(
        "This example requires the `schedulefree` library. Please install it with `pip install schedulefree`"
    )


########################################################################
# This is a fully working simple example to use Accelerate and Facebook's
# schedule-free optimizer: https://github.com/facebookresearch/schedule_free/
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUs (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# NOTE(review): presumably a per-step batch-size cap and the validation batch size, as in the sibling example
# scripts — confirm against the rest of this script.
MAX_GPU_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    """
    Creates a set of `DataLoader`s for the `glue` dataset,
    using "bert-base-cased" as the tokenizer.

    Args:
        accelerator (`Accelerator`):
            An `Accelerator` object
        batch_size (`int`, *optional*):
            The batch size for the train DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.

    Returns:
        `tuple`: The `(train_dataloader, eval_dataloader)` pair.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    raw_datasets = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Apply the method we just defined to all the examples in all the splits of the dataset
    # starting with the main process first:
    with accelerator.main_process_first():
        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
    # transformers library
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    def collate_fn(examples):
        # For Torchxla, it's best to pad everything to the same length or training will be very slow.
        # The "longest" strategy ignores `max_length`, so a fixed length requires the "max_length" strategy.
        if accelerator.distributed_type == DistributedType.XLA:
            padding, max_length = "max_length", 128
        else:
            padding, max_length = "longest", None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
        elif accelerator.mixed_precision != "no":
            pad_to_multiple_of = 8
        else:
            pad_to_multiple_of = None

        return tokenizer.pad(
            examples,
            padding=padding,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors="pt",
        )

    # Instantiate dataloaders.
    train_dataloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size, drop_last=True
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"],
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=EVAL_BATCH_SIZE,
        # Incomplete batches are only dropped for fp8.
        drop_last=(accelerator.mixed_precision == "fp8"),
    )

    return train_dataloader, eval_dataloader


# For testing only: when `TESTING_MOCKED_DATALOADERS` is set to "1" in the environment, `get_dataloaders`
# is rebound to the mocked factory from `accelerate.test_utils.training`.


if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
    from accelerate.test_utils.training import mocked_dataloaders

    get_dataloaders = mocked_dataloaders  # noqa: F811


def training_function(config, args):
    """
    Fine-tunes "bert-base-cased" on GLUE MRPC with the schedule-free AdamW optimizer and prints the
    evaluation metric after every epoch.

    Args:
        config (`dict`):
            Hyper-parameters; the keys "lr", "num_epochs", "seed" and "batch_size" are read.
        args (`argparse.Namespace`):
            Parsed command line arguments; `cpu` and `mixed_precision` are read.
    """
    # Initialize accelerator
    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)

    # Hyper-parameters: learning rate, number of epochs, seed and batch size
    learning_rate = config["lr"]
    num_epochs, seed, batch_size = (int(config[key]) for key in ("num_epochs", "seed", "batch_size"))

    metric = evaluate.load("glue", "mrpc")

    # If the batch size is too big we use gradient accumulation (never on XLA)
    use_accumulation = batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA
    if use_accumulation:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE
    else:
        gradient_accumulation_steps = 1

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
    # The model is built after seeding so that the seed also controls new weights initialization
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # When placing tensors on devices manually, this must happen before the optimizer is created, otherwise
    # training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)

    # Instantiate optimizer with warmup steps
    optimizer = schedulefree.AdamWScheduleFree(model.parameters(), lr=learning_rate, warmup_steps=100)

    # Prepare everything: objects come back in the same order they were passed in.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Now we train the model
    for epoch in range(num_epochs):
        # Both the model and the optimizer are switched to training mode
        model.train()
        optimizer.train()
        for step, batch in enumerate(train_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            scaled_loss = model(**batch).loss / gradient_accumulation_steps
            accelerator.backward(scaled_loss)
            if step % gradient_accumulation_steps != 0:
                continue
            optimizer.step()
            optimizer.zero_grad()

        # Both the model and the optimizer are switched to evaluation mode
        model.eval()
        optimizer.eval()
        for batch in eval_dataloader:
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                logits = model(**batch).logits
            predictions, references = accelerator.gather_for_metrics((logits.argmax(dim=-1), batch["labels"]))
            metric.add_batch(predictions=predictions, references=references)

        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", metric.compute())
    accelerator.end_training()


def main():
    """
    Parses the command line arguments and runs `training_function` with the default hyper-parameters.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined verbatim, so each piece carries its own separating space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/by_feature/tracking.py
================================================
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType


########################################################################
# This is a fully working simple example to use Accelerate,
# specifically showcasing the experiment tracking capability,
# and builds off the `nlp_example.py` script.
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUs (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To help focus on the differences in the code, building `DataLoaders`
# was refactored into its own function.
# New additions from the base script can be found quickly by
# looking for the # New Code # tags
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################

# Threshold above which `training_function` switches to gradient accumulation (except on XLA).
MAX_GPU_BATCH_SIZE = 16
# Batch size of the evaluation `DataLoader` built in `get_dataloaders`.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    """
    Creates a set of `DataLoader`s for the `glue` dataset,
    using "bert-base-cased" as the tokenizer.

    Args:
        accelerator (`Accelerator`):
            An `Accelerator` object
        batch_size (`int`, *optional*):
            The batch size for the train DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.

    Returns:
        `tuple`: The `(train_dataloader, eval_dataloader)` pair.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    raw_datasets = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Apply the method we just defined to all the examples in all the splits of the dataset
    # starting with the main process first:
    with accelerator.main_process_first():
        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
    # transformers library
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
        # The "longest" strategy ignores `max_length`, so a fixed length requires the "max_length" strategy.
        if accelerator.distributed_type == DistributedType.XLA:
            padding, max_length = "max_length", 128
        else:
            padding, max_length = "longest", None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
        elif accelerator.mixed_precision != "no":
            pad_to_multiple_of = 8
        else:
            pad_to_multiple_of = None

        return tokenizer.pad(
            examples,
            padding=padding,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors="pt",
        )

    # Instantiate dataloaders.
    train_dataloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
    )

    return train_dataloader, eval_dataloader


# For testing only: when `TESTING_MOCKED_DATALOADERS` is set to "1" in the environment, `get_dataloaders`
# is rebound to the mocked factory from `accelerate.test_utils.training`.
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
    from accelerate.test_utils.training import mocked_dataloaders

    get_dataloaders = mocked_dataloaders  # noqa: F811


def training_function(config, args):
    """
    Fine-tunes "bert-base-cased" on GLUE MRPC and, when `args.with_tracking` is set, logs the evaluation
    metrics and the mean training loss of every epoch to the available experiment trackers.

    Args:
        config (`dict`):
            Hyper-parameters; the keys "lr", "num_epochs", "seed" and "batch_size" are read. When the
            `TESTING_MOCKED_DATALOADERS` environment variable is "1", "num_epochs" is overwritten with 2.
        args (`argparse.Namespace`):
            Parsed command line arguments; `cpu`, `mixed_precision`, `with_tracking` and `project_dir` are read.
    """
    # For testing only
    if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
        config["num_epochs"] = 2
    # Initialize Accelerator

    # New Code #
    # We pass in "all" to `log_with` to grab all available trackers in the environment
    # Note: If using a custom `Tracker` class, should be passed in here such as:
    # >>> log_with = ["all", MyCustomTrackerClassInstance()]
    if args.with_tracking:
        accelerator = Accelerator(
            cpu=args.cpu, mixed_precision=args.mixed_precision, log_with="all", project_dir=args.project_dir
        )
    else:
        accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])
    set_seed(seed)

    # If the batch size is too big we use gradient accumulation.
    # This has to happen before the dataloaders are created, otherwise the train dataloader would keep the
    # uncapped batch size while gradients are also accumulated over several steps.
    gradient_accumulation_steps = 1
    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
    metric = evaluate.load("glue", "mrpc")

    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
    # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
    )

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # New Code #
    # We need to initialize the trackers we use. Overall configurations can also be stored
    if args.with_tracking:
        # The run is named after this file, without its extension
        run = os.path.split(__file__)[-1].split(".")[0]
        accelerator.init_trackers(run, config)

    # Now we train the model
    for epoch in range(num_epochs):
        model.train()
        # New Code #
        # For our tracking example, we will log the total loss of each epoch
        if args.with_tracking:
            total_loss = 0
        for step, batch in enumerate(train_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            outputs = model(**batch)
            loss = outputs.loss
            # New Code #
            # The unscaled loss is accumulated, detached so that no autograd graph is kept alive
            if args.with_tracking:
                total_loss += loss.detach().float()
            loss = loss / gradient_accumulation_steps
            accelerator.backward(loss)
            if step % gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

        model.eval()
        for batch in eval_dataloader:
            # We could avoid this line since we set the accelerator with `device_placement=True` (the default).
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)

        # New Code #
        # To actually log, we call `Accelerator.log`
        # The values passed can be of `str`, `int`, `float` or `dict` of `str` to `float`/`int`
        if args.with_tracking:
            accelerator.log(
                {
                    "accuracy": eval_metric["accuracy"],
                    "f1": eval_metric["f1"],
                    "train_loss": total_loss.item() / len(train_dataloader),
                    "epoch": epoch,
                },
                step=epoch,
            )

    accelerator.end_training()


def main():
    """
    Parses the command line arguments and runs `training_function` with the default hyper-parameters.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined verbatim, so each piece carries its own separating space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    parser.add_argument(
        "--with_tracking",
        action="store_true",
        help="Whether to load in all available experiment trackers from the environment and use them for logging.",
    )
    parser.add_argument(
        "--project_dir",
        type=str,
        default="logs",
        help="Location on where to store experiment tracking logs and relevant project information",
    )
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/complete_cv_example.py
================================================
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import re

import numpy as np
import PIL
import torch
from timm import create_model
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, RandomResizedCrop, Resize, ToTensor

from accelerate import Accelerator, DataLoaderConfiguration
from accelerate.utils import is_xpu_available


########################################################################
# This is a fully working simple example to use Accelerate
#
# This example trains a ResNet50 on the Oxford-IIIT Pet Dataset
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUs (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# Function to get the label from the filename
def extract_label(fname):
    """
    Returns the label encoded in an image file name of the form `<label>_<number>.jpg`.

    Args:
        fname (`str`):
            Path or file name of the image; only the last path component is inspected.

    Returns:
        `str`: Everything that precedes the final `_<number>.jpg` suffix of the file name.

    Raises:
        `ValueError`: If the file name does not follow the `<label>_<number>.jpg` pattern.
    """
    stem = os.path.basename(fname)
    match = re.search(r"^(.*)_\d+\.jpg$", stem)
    if match is None:
        # Fail with an actionable message instead of an `AttributeError` on `None`.
        raise ValueError(
            f"Cannot extract a label from `{fname}`: expected a file name of the form `<label>_<number>.jpg`."
        )
    return match.group(1)


class PetsDataset(Dataset):
    """
    Dataset that loads images from disk and derives each label from the image's file name.

    Args:
        file_names:
            Sequence of image paths; its length is the length of the dataset.
        image_transform (*optional*):
            Callable applied to the RGB image before it is returned.
        label_to_id (*optional*):
            Mapping used to convert the label extracted from the file name into an id.
    """

    def __init__(self, file_names, image_transform=None, label_to_id=None):
        self.file_names, self.image_transform, self.label_to_id = file_names, image_transform, label_to_id

    def __len__(self):
        return len(self.file_names)

    def __getitem__(self, idx):
        """Returns a dict with the (optionally transformed) "image" and its (optionally mapped) "label"."""
        path = self.file_names[idx]
        image = PIL.Image.open(path).convert("RGB")
        transform = self.image_transform
        if transform is not None:
            image = transform(image)
        label = extract_label(path)
        mapping = self.label_to_id
        return {"image": image, "label": label if mapping is None else mapping[label]}


def training_function(config, args):
    """
    Trains the classifier head of a pretrained "resnet50d" on the images found in `args.data_dir`, with
    optional experiment tracking, checkpointing and resuming from a checkpoint.

    Args:
        config (`dict`):
            Hyper-parameters; the keys "lr", "num_epochs", "seed", "batch_size" and "image_size" are read.
        args (`argparse.Namespace`):
            Parsed command line arguments; `cpu`, `mixed_precision`, `use_stateful_dataloader`, `with_tracking`,
            `project_dir`, `checkpointing_steps`, `data_dir`, `output_dir` and `resume_from_checkpoint` are read.

    Raises:
        `ValueError`: If `args.checkpointing_steps` is a string that is neither a number nor "epoch".
    """
    # Initialize accelerator
    dataloader_config = DataLoaderConfiguration(use_stateful_dataloader=args.use_stateful_dataloader)
    if args.with_tracking:
        accelerator = Accelerator(
            cpu=args.cpu,
            mixed_precision=args.mixed_precision,
            log_with="all",
            project_dir=args.project_dir,
            dataloader_config=dataloader_config,
        )
    else:
        accelerator = Accelerator(
            cpu=args.cpu, mixed_precision=args.mixed_precision, dataloader_config=dataloader_config
        )

    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])
    image_size = config["image_size"]
    if not isinstance(image_size, (list, tuple)):
        image_size = (image_size, image_size)

    # Parse out whether we are saving every epoch or after a certain number of batches
    if hasattr(args.checkpointing_steps, "isdigit"):
        if args.checkpointing_steps == "epoch":
            checkpointing_steps = args.checkpointing_steps
        elif args.checkpointing_steps.isdigit():
            checkpointing_steps = int(args.checkpointing_steps)
        else:
            raise ValueError(
                f"Argument `checkpointing_steps` must be either a number or `epoch`. `{args.checkpointing_steps}` passed."
            )
    else:
        checkpointing_steps = None

    # We need to initialize the trackers we use, and also store our configuration
    if args.with_tracking:
        # The run is named after this file, without its extension
        run = os.path.split(__file__)[-1].split(".")[0]
        accelerator.init_trackers(run, config)

    # Grab all the image filenames
    file_names = [os.path.join(args.data_dir, fname) for fname in os.listdir(args.data_dir) if fname.endswith(".jpg")]

    # Build the label correspondences
    all_labels = [extract_label(fname) for fname in file_names]
    id_to_label = list(set(all_labels))
    id_to_label.sort()
    label_to_id = {lbl: i for i, lbl in enumerate(id_to_label)}

    # Set the seed before splitting the data.
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    elif is_xpu_available():
        torch.xpu.manual_seed_all(seed)

    # Split our filenames between train and validation
    random_perm = np.random.permutation(len(file_names))
    cut = int(0.8 * len(file_names))
    train_split = random_perm[:cut]
    eval_split = random_perm[cut:]

    # For training we use a simple RandomResizedCrop
    train_tfm = Compose([RandomResizedCrop(image_size, scale=(0.5, 1.0)), ToTensor()])
    train_dataset = PetsDataset(
        [file_names[i] for i in train_split], image_transform=train_tfm, label_to_id=label_to_id
    )

    # For evaluation, we use a deterministic Resize
    eval_tfm = Compose([Resize(image_size), ToTensor()])
    eval_dataset = PetsDataset([file_names[i] for i in eval_split], image_transform=eval_tfm, label_to_id=label_to_id)

    # Instantiate dataloaders.
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=4)
    eval_dataloader = DataLoader(eval_dataset, shuffle=False, batch_size=batch_size, num_workers=4)

    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = create_model("resnet50d", pretrained=True, num_classes=len(label_to_id))

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
    # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)

    # Freezing the base model
    for param in model.parameters():
        param.requires_grad = False
    for param in model.get_classifier().parameters():
        param.requires_grad = True

    # We normalize the batches of images to be a bit faster.
    mean = torch.tensor(model.default_cfg["mean"])[None, :, None, None].to(accelerator.device)
    std = torch.tensor(model.default_cfg["std"])[None, :, None, None].to(accelerator.device)

    # Instantiate optimizer
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr / 25)

    # Instantiate learning rate scheduler
    lr_scheduler = OneCycleLR(optimizer=optimizer, max_lr=lr, epochs=num_epochs, steps_per_epoch=len(train_dataloader))

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )
    # We need to keep track of how many total steps we have iterated over
    overall_step = 0
    # We also need to keep track of the starting epoch so files are named properly
    starting_epoch = 0
    # Number of batches to skip in the first resumed epoch (`None` when no partial epoch has to be skipped)
    resume_step = None

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
        accelerator.load_state(args.resume_from_checkpoint)
        # `normpath` drops a trailing separator so that the folder name is not parsed as an empty string
        path = os.path.basename(os.path.normpath(args.resume_from_checkpoint))
        # Extract `epoch_{i}` or `step_{i}`
        training_difference = os.path.splitext(path)[0]

        if "epoch" in training_difference:
            starting_epoch = int(training_difference.replace("epoch_", "")) + 1
        else:
            resume_step = int(training_difference.replace("step_", ""))
            starting_epoch = resume_step // len(train_dataloader)
            resume_step -= starting_epoch * len(train_dataloader)
        # Restore the global step counter for the epochs already completed; the remainder inside the
        # resumed epoch is added when its batches are skipped below.
        overall_step = starting_epoch * len(train_dataloader)

    # Now we train the model
    for epoch in range(starting_epoch, num_epochs):
        model.train()
        if args.with_tracking:
            total_loss = 0
        if epoch == starting_epoch and resume_step is not None:
            # We need to skip steps until we reach the resumed step
            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
            overall_step += resume_step
        else:
            # After the first iteration though, we need to go back to the original dataloader
            active_dataloader = train_dataloader
        for batch in active_dataloader:
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch = {k: v.to(accelerator.device) for k, v in batch.items()}
            inputs = (batch["image"] - mean) / std
            outputs = model(inputs)
            loss = torch.nn.functional.cross_entropy(outputs, batch["label"])
            # We keep track of the loss at each epoch
            if args.with_tracking:
                total_loss += loss.detach().float()
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            overall_step += 1
            if isinstance(checkpointing_steps, int):
                output_dir = f"step_{overall_step}"
                if overall_step % checkpointing_steps == 0:
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
                    accelerator.save_state(output_dir)
        model.eval()
        accurate = 0
        num_elems = 0
        for batch in eval_dataloader:
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch = {k: v.to(accelerator.device) for k, v in batch.items()}
            inputs = (batch["image"] - mean) / std
            with torch.no_grad():
                outputs = model(inputs)
            predictions = outputs.argmax(dim=-1)
            predictions, references = accelerator.gather_for_metrics((predictions, batch["label"]))
            accurate_preds = predictions == references
            num_elems += accurate_preds.shape[0]
            accurate += accurate_preds.long().sum()

        eval_metric = accurate.item() / num_elems
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}: {100 * eval_metric:.2f}")
        if args.with_tracking:
            accelerator.log(
                {
                    "accuracy": 100 * eval_metric,
                    "train_loss": total_loss.item() / len(train_dataloader),
                    "epoch": epoch,
                },
                step=overall_step,
            )
        if checkpointing_steps == "epoch":
            output_dir = f"epoch_{epoch}"
            if args.output_dir is not None:
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

    accelerator.end_training()


def main():
    """
    Parses the command line arguments and runs `training_function` with the default hyper-parameters.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument("--data_dir", required=True, help="The data folder on disk.")
    # NOTE: `args.fp16` is not read by `training_function` in this file; precision is selected through
    # `--mixed_precision`. The flag is kept so that existing command lines keep parsing.
    parser.add_argument("--fp16", action="store_true", help="If passed, will use FP16 training.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined verbatim, so each piece carries its own separating space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    parser.add_argument(
        "--checkpointing_steps",
        type=str,
        default=None,
        help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=".",
        help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
    )
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help="If the training should continue from a checkpoint folder.",
    )
    parser.add_argument(
        "--use_stateful_dataloader",
        action="store_true",
        help="If the dataloader should be a resumable stateful dataloader.",
    )
    parser.add_argument(
        "--with_tracking",
        action="store_true",
        help="Whether to load in all available experiment trackers from the environment and use them for logging.",
    )
    parser.add_argument(
        "--project_dir",
        type=str,
        default="logs",
        help="Location on where to store experiment tracking logs and relevant project information",
    )
    args = parser.parse_args()
    config = {"lr": 3e-2, "num_epochs": 3, "seed": 42, "batch_size": 64, "image_size": 224}
    training_function(config, args)


# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/complete_nlp_example.py
================================================
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DataLoaderConfiguration, DistributedType


########################################################################
# This is a fully working simple example to use Accelerate
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUS (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# This example also demonstrates the checkpointing and sharding capabilities
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# Upper bound for the train dataloader batch size: outside of XLA, a larger configured batch size is capped to this
# value in `training_function` and the difference is made up with gradient accumulation.
MAX_GPU_BATCH_SIZE = 16
# Batch size used for the evaluation dataloader.
EVAL_BATCH_SIZE = 32


def training_function(config, args):
    """Fine-tune `bert-base-cased` on GLUE MRPC with checkpointing, resuming and optional experiment tracking.

    Args:
        config (`dict`):
            Hyper-parameters. The keys `lr`, `num_epochs`, `seed` and `batch_size` are read.
        args (`argparse.Namespace`):
            Parsed command line arguments. The attributes `cpu`, `mixed_precision`, `checkpointing_steps`,
            `resume_from_checkpoint`, `use_stateful_dataloader`, `with_tracking`, `output_dir` and `project_dir`
            are read. When `resume_from_checkpoint` is set, the last component of that path must be named
            `epoch_{i}` or `step_{i}` (the naming used when this function saves checkpoints).

    Raises:
        ValueError: If `args.checkpointing_steps` is a string that is neither `"epoch"` nor made only of digits.
    """
    # Initialize accelerator
    dataloader_config = DataLoaderConfiguration(use_stateful_dataloader=args.use_stateful_dataloader)
    if args.with_tracking:
        accelerator = Accelerator(
            cpu=args.cpu,
            mixed_precision=args.mixed_precision,
            dataloader_config=dataloader_config,
            log_with="all",
            project_dir=args.project_dir,
        )
    else:
        accelerator = Accelerator(
            cpu=args.cpu, mixed_precision=args.mixed_precision, dataloader_config=dataloader_config
        )

    # `checkpointing_steps` arrives as a string (or `None`): turn it into `"epoch"`, an `int` or `None`.
    if hasattr(args.checkpointing_steps, "isdigit"):
        if args.checkpointing_steps == "epoch":
            checkpointing_steps = args.checkpointing_steps
        elif args.checkpointing_steps.isdigit():
            checkpointing_steps = int(args.checkpointing_steps)
        else:
            raise ValueError(
                f"Argument `checkpointing_steps` must be either a number or `epoch`. `{args.checkpointing_steps}` passed."
            )
    else:
        checkpointing_steps = None
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])

    # We need to initialize the trackers we use, and also store our configuration
    if args.with_tracking:
        # The run is named after this script's file name, without its extension.
        run = os.path.split(__file__)[-1].split(".")[0]
        accelerator.init_trackers(run, config)

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    datasets = load_dataset("glue", "mrpc")
    metric = evaluate.load("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
        return outputs

    # Apply the method we just defined to all the examples in all the splits of the dataset
    # starting with the main process first:
    with accelerator.main_process_first():
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
    # transformers library
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
        elif accelerator.mixed_precision != "no":
            pad_to_multiple_of = 8
        else:
            pad_to_multiple_of = None

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors="pt",
        )

    # Instantiate dataloaders.
    train_dataloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
    )

    set_seed(seed)

    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
    # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
    )

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # We need to keep track of how many total steps we have iterated over
    overall_step = 0
    # We also need to keep track of the starting epoch so files are named properly
    starting_epoch = 0
    # Number of batches to skip in the first resumed epoch; stays `None` unless we resume from a `step_{i}` folder
    resume_step = None

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
        accelerator.load_state(args.resume_from_checkpoint)
        # Normalize first: `os.path.basename("out/epoch_0/")` is an empty string, which would make the parsing
        # below fail on a path that merely ends with a separator.
        path = os.path.basename(os.path.normpath(args.resume_from_checkpoint))
        # Extract `epoch_{i}` or `step_{i}`
        training_difference = os.path.splitext(path)[0]

        if "epoch" in training_difference:
            starting_epoch = int(training_difference.replace("epoch_", "")) + 1
            resume_step = None
        else:
            resume_step = int(training_difference.replace("step_", ""))
            starting_epoch = resume_step // len(train_dataloader)
            resume_step -= starting_epoch * len(train_dataloader)

    # Now we train the model
    for epoch in range(starting_epoch, num_epochs):
        model.train()
        if args.with_tracking:
            total_loss = 0
        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
            # We need to skip steps until we reach the resumed step
            if not args.use_stateful_dataloader:
                active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
            else:
                # No manual skipping here: the stateful dataloader is used as-is.
                active_dataloader = train_dataloader
            # NOTE(review): only the within-epoch remainder is added, so after resuming `overall_step` does not
            # count the batches of the epochs that were already completed - confirm this is intended.
            overall_step += resume_step
        else:
            # After the first iteration though, we need to go back to the original dataloader
            active_dataloader = train_dataloader
        for step, batch in enumerate(active_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / gradient_accumulation_steps
            # We keep track of the loss at each epoch
            if args.with_tracking:
                total_loss += loss.detach().float()
            accelerator.backward(loss)
            # NOTE(review): this fires on steps 0, N, 2N, ... so the first update of each epoch happens after a
            # single new micro-batch; kept as-is.
            if step % gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            overall_step += 1

            if isinstance(checkpointing_steps, int):
                if overall_step % checkpointing_steps == 0:
                    # The folder name is only needed when a checkpoint is actually written
                    output_dir = f"step_{overall_step}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
                    accelerator.save_state(output_dir)

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)
        if args.with_tracking:
            accelerator.log(
                {
                    "accuracy": eval_metric["accuracy"],
                    "f1": eval_metric["f1"],
                    # Sum of the (accumulation-scaled) batch losses divided by the full dataloader length
                    "train_loss": total_loss.item() / len(train_dataloader),
                    "epoch": epoch,
                },
                step=epoch,
            )

        if checkpointing_steps == "epoch":
            output_dir = f"epoch_{epoch}"
            if args.output_dir is not None:
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

    accelerator.end_training()


def main():
    """Parse the command line arguments and run `training_function` with the default hyper-parameters."""
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined verbatim, so every fragment has to carry its own separating space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    parser.add_argument(
        "--checkpointing_steps",
        type=str,
        default=None,
        help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
    )
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help="If the training should continue from a checkpoint folder.",
    )
    parser.add_argument(
        "--use_stateful_dataloader",
        action="store_true",
        help="If the dataloader should be a resumable stateful dataloader.",
    )
    parser.add_argument(
        "--with_tracking",
        action="store_true",
        help="Whether to load in all available experiment trackers from the environment and use them for logging.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=".",
        help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
    )
    parser.add_argument(
        "--project_dir",
        type=str,
        default="logs",
        help="Location on where to store experiment tracking logs and relevant project information",
    )
    args = parser.parse_args()
    # Default hyper-parameters of the example
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


# Only launch the example when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/config_yaml_templates/README.md
================================================
# Config Zoo

This folder contains a variety of minimal configurations for `Accelerate` achieving certain goals. You can use these 
direct config YAML's, or build off of them for your own YAML's.

These are highly annotated versions, aiming to teach you what each section does.

Each config can be run via `accelerate launch --config_file {file} run_me.py`

`run_me.py` will then print out how the current environment is setup (the contents of the `AcceleratorState`)

================================================
FILE: examples/config_yaml_templates/deepspeed.yaml
================================================
# Similar to FSDP, we set the distributed type as DEEPSPEED
distributed_type: DEEPSPEED
# With DeepSpeed, we utilize a deepspeed config file for the entire configuration
deepspeed_config:
  # Can also be any of the config json's in accelerate/examples/deepspeed_config_templates
  deepspeed_config_file: ../deepspeed_config_templates/zero_stage1_config.json
  # If using ZeRO-3 and wanting to load big models in, this should be set to `true` so 
  # `transformers` uses the right `init` function
  zero3_init_flag: false # true 

# Finally we need to specify the number of accelerators to use
num_processes: 2
# Optionally we can set the mixed precision now instead of in the deepspeed config file,
# however this requires the `fp16` and `bf16` options to be set to `auto` in the deepspeed config file
# mixed_precision: "bf16"


================================================
FILE: examples/config_yaml_templates/fp8.yaml
================================================
# This config template simply sets up the TransformerEngine config (and a config for a single GPU),
# this can interop with the other configs in this folder
distributed_type: "NO"
mixed_precision: "fp8"
# Then we specify the fp8 configuration:
fp8_config:
  backend: TE # Can be TE | MS-AMP
  # The following are TE specific arguments.
  # See https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html#common-api for more details
  amax_history_len: 1024
  fp8_format: E4M3
  interval: 1
  margin: 0
  override_linear_precision: [false, false, false]
  # Generally this should always be set to `false` to have the most realistic fp8 eval performance
  use_autocast_during_eval: false
  # If using MS-AMP, we ignore all of the prior and set an `opt_level`
  #opt_level: O1


================================================
FILE: examples/config_yaml_templates/fsdp.yaml
================================================
# Since we are doing FSDP (even though it's multi-accelerator), we need to specify the distributed type as FSDP
distributed_type: FSDP
# Can be one of "no", "fp16", or "bf16" (see `fp8.yaml` for `fp8`, but it works for FSDP as well)
mixed_precision: 'bf16'
# Specify the number of accelerators to use
num_processes: 2
# Then we can specify the FSDP config
fsdp_config:
  fsdp_activation_checkpointing: false
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: true
  fsdp_forward_prefetch: false
  fsdp_offload_params: false
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_use_orig_params: true


================================================
FILE: examples/config_yaml_templates/multi_gpu.yaml
================================================
# Specify distributed_type as `MULTI_GPU` for DDP
distributed_type: "MULTI_GPU"
# Can be one of "no", "fp16", or "bf16" (see `fp8.yaml` for `fp8`)
mixed_precision: "bf16"
# Specify the number of GPUs to use
num_processes: 2

================================================
FILE: examples/config_yaml_templates/multi_node.yaml
================================================
# This config template is for a multi-node setup. This assumes DDP, but can be interop'd with the other configs in this folder
# Generally it's recommended to look at the SLURM config template for a more robust multi-node setup
distributed_type: MULTI_GPU
# We need to specify the current machine's rank
machine_rank: 0
# We then need to specify the IP address and port of the main process
main_process_ip: '1234'
main_process_port: 9999
# We need to specify the number of machines
num_machines: 2
# We need to specify the *total* number of processes
num_processes: 8
# And then we need to specify how rendezvous (rdzv) comms will be handled
rdzv_backend: static # or c10d
# If the compute nodes are on the same network (cloud will more than likely be false)
same_network: false


================================================
FILE: examples/config_yaml_templates/multi_xpu.yaml
================================================
# Specify distributed_type as `MULTI_XPU` for DDP
distributed_type: "MULTI_XPU"
# Can be one of "no", "fp16", or "bf16" (see `fp8.yaml` for `fp8`)
mixed_precision: "bf16"
# Specify the number of XPUs to use
num_processes: 2


================================================
FILE: examples/config_yaml_templates/run_me.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
A base script which outputs the accelerate config for the given environment
"""

from accelerate import Accelerator


# `Accelerator` is created without arguments, so everything it reports below comes from the environment.
accelerator = Accelerator()

# Report the accelerator state first.
state_report = f"Accelerator state from the current environment:\n{accelerator.state}"
accelerator.print(state_report)

# The FP8 recipe is only reported when a handler is set.
fp8_recipe = accelerator.fp8_recipe_handler
if fp8_recipe is not None:
    accelerator.print(f"FP8 config:\n{fp8_recipe}")

accelerator.end_training()


================================================
FILE: examples/config_yaml_templates/single_accelerator.yaml
================================================
# Since this is single GPU/XPU, we don't need distributed training
distributed_type: "NO"
# Can be one of "no", "fp16", or "bf16" (see `fp8.yaml` for `fp8`)
mixed_precision: "bf16"


================================================
FILE: examples/cv_example.py
================================================
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import re

import numpy as np
import PIL
import torch
from timm import create_model
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, RandomResizedCrop, Resize, ToTensor

from accelerate import Accelerator
from accelerate.utils import set_seed


########################################################################
# This is a fully working simple example to use Accelerate
#
# This example trains a ResNet50 on the Oxford-IIIT Pet Dataset
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUS (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# Function to get the label from the filename
def extract_label(fname):
    """Return the class label encoded in an image file name.

    File names are expected to look like `<label>_<number>.jpg`; leading directories are ignored.

    Args:
        fname (`str`): Path or file name of the image.

    Returns:
        `str`: The part of the file name that precedes the final `_<number>.jpg`.

    Raises:
        ValueError: If the file name does not follow the `<label>_<number>.jpg` pattern.
    """
    stem = os.path.basename(fname)
    match = re.search(r"^(.*)_\d+\.jpg$", stem)
    if match is None:
        # Fail with an actionable message instead of an `AttributeError` on `None`.
        raise ValueError(f"Cannot extract a label from `{fname}`: expected a name like `<label>_<number>.jpg`.")
    return match.group(1)


class PetsDataset(Dataset):
    """Dataset yielding `{"image": ..., "label": ...}` samples from a list of image paths.

    Args:
        file_names (`list[str]`):
            Paths of the images. The label of each sample is parsed from its file name with `extract_label`.
        image_transform (`callable`, *optional*):
            Applied to the RGB image when provided.
        label_to_id (`dict`, *optional*):
            When provided, the parsed label is replaced by `label_to_id[label]`.
    """

    def __init__(self, file_names, image_transform=None, label_to_id=None):
        self.file_names = file_names
        self.image_transform = image_transform
        self.label_to_id = label_to_id

    def __len__(self):
        """Return the number of image paths."""
        return len(self.file_names)

    def __getitem__(self, idx):
        """Load the image at position `idx` and return it together with its label."""
        fname = self.file_names[idx]
        # `convert` returns a new, loaded image, so the handle opened by `PIL.Image.open` can be released right
        # away instead of staying open until it is garbage collected.
        with PIL.Image.open(fname) as raw_image:
            image = raw_image.convert("RGB")
        if self.image_transform is not None:
            image = self.image_transform(image)
        label = extract_label(fname)
        if self.label_to_id is not None:
            label = self.label_to_id[label]
        return {"image": image, "label": label}


def training_function(config, args):
    """Train the classifier head of a pretrained `resnet50d` on the `.jpg` images found in `args.data_dir`.

    Args:
        config (`dict`):
            Hyper-parameters. The keys `lr`, `num_epochs`, `seed`, `batch_size` and `image_size` are read;
            `image_size` may be a single value or a list/tuple.
        args (`argparse.Namespace`):
            Parsed command line arguments. The attributes `cpu`, `mixed_precision` and `data_dir` are read.
    """
    # Initialize accelerator
    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)

    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])
    image_size = config["image_size"]
    # A single value is expanded to a square `(size, size)` pair
    if not isinstance(image_size, (list, tuple)):
        image_size = (image_size, image_size)

    # Grab all the image filenames
    # NOTE(review): `os.listdir` returns entries in arbitrary order, so the seeded split below is only reproducible
    # if the listing order is stable across runs - confirm whether sorting is wanted.
    file_names = [os.path.join(args.data_dir, fname) for fname in os.listdir(args.data_dir) if fname.endswith(".jpg")]

    # Build the label correspondences
    all_labels = [extract_label(fname) for fname in file_names]
    id_to_label = list(set(all_labels))
    # Sorting makes the label -> id mapping independent of set iteration order
    id_to_label.sort()
    label_to_id = {lbl: i for i, lbl in enumerate(id_to_label)}

    # Set the seed before splitting the data.
    set_seed(seed)
    # Split our filenames between train and validation
    random_perm = np.random.permutation(len(file_names))
    # First 80% of the permutation is used for training, the rest for evaluation
    cut = int(0.8 * len(file_names))
    train_split = random_perm[:cut]
    eval_split = random_perm[cut:]

    # For training we use a simple RandomResizedCrop
    train_tfm = Compose([RandomResizedCrop(image_size, scale=(0.5, 1.0)), ToTensor()])
    train_dataset = PetsDataset(
        [file_names[i] for i in train_split], image_transform=train_tfm, label_to_id=label_to_id
    )

    # For evaluation, we use a deterministic Resize
    eval_tfm = Compose([Resize(image_size), ToTensor()])
    eval_dataset = PetsDataset([file_names[i] for i in eval_split], image_transform=eval_tfm, label_to_id=label_to_id)

    # Instantiate dataloaders.
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=4)
    eval_dataloader = DataLoader(eval_dataset, shuffle=False, batch_size=batch_size, num_workers=4)

    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = create_model("resnet50d", pretrained=True, num_classes=len(label_to_id))

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
    # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)

    # Freezing the base model
    # Everything is frozen first, then only the classifier parameters are re-enabled
    for param in model.parameters():
        param.requires_grad = False
    for param in model.get_classifier().parameters():
        param.requires_grad = True

    # We normalize the batches of images to be a bit faster.
    # `[None, :, None, None]` adds singleton dimensions around the per-channel values so they broadcast in the
    # `(batch["image"] - mean) / std` expressions below
    mean = torch.tensor(model.default_cfg["mean"])[None, :, None, None].to(accelerator.device)
    std = torch.tensor(model.default_cfg["std"])[None, :, None, None].to(accelerator.device)

    # Instantiate optimizer
    # All parameters are handed over; only the classifier ones have `requires_grad=True` (set above)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr / 25)

    # Instantiate learning rate scheduler
    lr_scheduler = OneCycleLR(optimizer=optimizer, max_lr=lr, epochs=num_epochs, steps_per_epoch=len(train_dataloader))

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # Now we train the model
    for epoch in range(num_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch = {k: v.to(accelerator.device) for k, v in batch.items()}
            inputs = (batch["image"] - mean) / std
            outputs = model(inputs)
            loss = torch.nn.functional.cross_entropy(outputs, batch["label"])
            accelerator.backward(loss)
            # The scheduler is stepped once per batch, matching `steps_per_epoch=len(train_dataloader)` above
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        model.eval()
        # `accurate` stays the int `0` until the first batch adds a tensor to it, so the `.item()` call after the
        # loop needs at least one evaluation batch
        accurate = 0
        num_elems = 0
        for _, batch in enumerate(eval_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch = {k: v.to(accelerator.device) for k, v in batch.items()}
            inputs = (batch["image"] - mean) / std
            with torch.no_grad():
                outputs = model(inputs)
            predictions = outputs.argmax(dim=-1)
            predictions, references = accelerator.gather_for_metrics((predictions, batch["label"]))
            accurate_preds = predictions == references
            num_elems += accurate_preds.shape[0]
            accurate += accurate_preds.long().sum()

        # Fraction of correctly classified evaluation samples for this epoch
        eval_metric = accurate.item() / num_elems
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}: {100 * eval_metric:.2f}")
    accelerator.end_training()


def main():
    """Parse the command line arguments and run `training_function` with the default hyper-parameters."""
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument("--data_dir", required=True, help="The data folder on disk.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are joined verbatim, so every fragment has to carry its own separating space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    args = parser.parse_args()
    # Default hyper-parameters of the example
    config = {"lr": 3e-2, "num_epochs": 3, "seed": 42, "batch_size": 64, "image_size": 224}
    training_function(config, args)


# Only launch the example when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/deepspeed_config_templates/zero_stage1_config.json
================================================
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto",
            "torch_adam": true,
            "adam_w_mode": true
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 1,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": "auto",
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

================================================
FILE: examples/deepspeed_config_templates/zero_stage2_config.json
================================================
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto",
            "torch_adam": true,
            "adam_w_mode": true
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": "auto",
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

================================================
FILE: examples/deepspeed_config_templates/zero_stage2_offload_config.json
================================================
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto",
            "torch_adam": true,
            "adam_w_mode": true
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": "auto",
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

================================================
FILE: examples/deepspeed_config_templates/zero_stage3_config.json
================================================
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "sub_group_size": 1e9,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": "auto"
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

================================================
FILE: examples/deepspeed_config_templates/zero_stage3_offload_config.json
================================================
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "sub_group_size": 1e9,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": "auto"
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

================================================
FILE: examples/finetune_lm_tpu.py
================================================
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Example of fine-tuning a model on a TPU using FSDPv2, TRL and PEFT.
#
# Run the script with:
# python finetune_lm_tpu.py [--model_id MODEL_ID] [--dataset_id DATASET_ID]
#
# This script has been tested on a TPU v5 litepod-8.

import argparse

import torch
import torch_xla.runtime as xr
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer


# FSDPv2 requires SPMD to be enabled.
# This call runs at import time, before `train()` loads the model or builds the trainer.
xr.use_spmd()


def format_dolly(example, tokenizer):
    """Render one Dolly record as a chat-formatted string.

    The instruction (followed by the context, when it is non-empty) becomes the user turn and
    the response becomes the assistant turn, preceded by a fixed system prompt. The
    conversation is rendered through the tokenizer's chat template without tokenizing it.
    """
    instruction = example["instruction"]
    context = example["context"]

    if len(context) > 0:
        user_turn = instruction + f"\n\nContext: {context}"
    else:
        user_turn = instruction

    system_message = {"role": "system", "content": "You are a helpful assistant"}
    user_message = {"role": "user", "content": user_turn}
    assistant_message = {"role": "assistant", "content": example["response"]}

    conversation = [system_message, user_message, assistant_message]
    return tokenizer.apply_chat_template(conversation, tokenize=False)


def train(model_id, dataset):
    """Fine-tune a causal LM with LoRA adapters using TRL's `SFTTrainer` and XLA FSDPv2.

    Args:
        model_id: Identifier passed to `AutoModelForCausalLM.from_pretrained` and
            `AutoTokenizer.from_pretrained`.
        dataset: Training dataset handed to `SFTTrainer`. Every example is formatted with
            `format_dolly`, which reads its `instruction`, `context` and `response` fields.

    Raises:
        ValueError: If the tokenizer has no pad token and none can be guessed.
    """
    # Load model with low_cpu_mem_usage to avoid loading full model into CPU memory
    # FSDPv2 will handle sharding across TPUs
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        use_cache=False,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        device_map=None,  # Let FSDP handle device placement
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Pick a pad token when the tokenizer does not define one.
    if tokenizer.pad_token is None:
        if model.config.model_type == "llama":
            # Vanilla Llama models have a finetune right pad id token
            tokenizer.pad_token = "<|finetune_right_pad_id|>"
        elif tokenizer.eos_token is not None:
            # Otherwise fall back to reusing the end-of-sequence token for padding.
            tokenizer.pad_token = tokenizer.eos_token
        else:
            raise ValueError(f"Cannot get or guess pad token for model {model_id}.")

    if tokenizer.chat_template is None:
        # Set chat template for Llama 3.1 format
        # (only applied when the tokenizer ships without a template; `format_dolly` relies on it).
        tokenizer.chat_template = (
            "{% for message in messages %}"
            "{% if message['role'] == 'system' %}"
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{{ message['content'] }}<|eot_id|>"
            "{% elif message['role'] == 'user' %}"
            "<|start_header_id|>user<|end_header_id|>\n\n{{ message['content'] }}<|eot_id|>"
            "{% elif message['role'] == 'assistant' %}"
            "<|start_header_id|>assistant<|end_header_id|>\n\n{{ message['content'] }}<|eot_id|>"
            "{% endif %}"
            "{% endfor %}"
            "{% if add_generation_prompt %}"
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
            "{% endif %}"
        )

    # Try to guess the DecoderLayer class name, based on common model architectures
    # NOTE: this assumes the decoder layers are reachable as `model.model.layers`; a model with a
    # different layout will raise on this line.
    transformer_layer_cls_to_wrap = model.model.layers[0].__class__.__name__

    # Get FSDP training arguments
    fsdp_training_args = {
        "fsdp": "full_shard",
        "fsdp_config": {
            "transformer_layer_cls_to_wrap": [transformer_layer_cls_to_wrap],
            "xla": True,
            "xla_fsdp_v2": True,
            "xla_fsdp_grad_ckpt": True,
        },
    }

    # Set up PEFT LoRA for fine-tuning.
    lora_config = LoraConfig(
        r=32,
        lora_alpha=128,
        lora_dropout=0.05,
        target_modules=["q_proj", "k_proj"],
        task_type="CAUSAL_LM",
    )

    sft_config = SFTConfig(
        # Must be disabled on TPU (not supported); checkpointing is requested through
        # `xla_fsdp_grad_ckpt` in the FSDP config above instead.
        gradient_checkpointing=False,  # Required on TPU, not supported
        max_length=1024,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        max_steps=-1,
        output_dir="./output",
        optim="adafactor",
        logging_steps=1,
        dataloader_drop_last=True,  # Required for FSDPv2.
        dataset_text_field="text",
        packing=True,
        **fsdp_training_args,
    )

    # Set up the trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        args=sft_config,
        peft_config=lora_config,
        processing_class=tokenizer,
        # Bind the tokenizer so the trainer can call the formatter with a single example.
        formatting_func=lambda example: format_dolly(example, tokenizer),
    )

    trainer.train()


# =============================================================================
# Main Function
# =============================================================================
if __name__ == "__main__":
    # Command-line interface: both flags are optional and fall back to the defaults below.
    cli = argparse.ArgumentParser(description="Simple example of training script.")
    cli.add_argument(
        "--model_id",
        "-m",
        type=str,
        default="meta-llama/Llama-3.2-1B",
        help="Model id to use for training.",
    )
    cli.add_argument(
        "--dataset_id",
        "-d",
        type=str,
        default="databricks/databricks-dolly-15k",
        help="Dataset id to use for training.",
    )
    options = cli.parse_args()

    # NOTE: this section can be adapted to load any dataset you want.
    train_split = load_dataset(options.dataset_id, split="train")

    train(model_id=options.model_id, dataset=train_split)


================================================
FILE: examples/inference/distributed/README.md
================================================
# Distributed inference examples

This folder contains a variety of tutorials for running distributed inference with the following strategy: 

Load an entire model onto each GPU and send chunks of a batch through each GPU’s model copy at a time.

## Installation

```bash
pip install accelerate torch
```

## Running code

You can either use `torchrun` or the recommended way of `accelerate launch` (without needing to run `accelerate config`) on each script:

```bash
accelerate launch --num_processes {NUM_GPUS} phi2.py
```

Or:

```bash
torchrun --nproc-per-node {NUM_GPUS} phi2.py
```


================================================
FILE: examples/inference/distributed/distributed_image_generation.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Originally by jiwooya1000, put together by sayakpaul.
Documentation: https://huggingface.co/docs/diffusers/main/en/training/distributed_inference

Run:

accelerate launch distributed_image_generation.py --batch_size 8

# Enable memory optimizations for large models like SD3
accelerate launch distributed_image_generation.py --batch_size 8 --low_mem
"""

import os
import time

import fire
import torch
from datasets import load_dataset
from diffusers import DiffusionPipeline
from tqdm import tqdm

from accelerate import PartialState
from accelerate.utils import gather_object


# Timestamp captured once at import time; `main()` appends it to `save_dir`.
START_TIME = time.strftime("%Y%m%d_%H%M%S")
# Maps the `dtype` argument of `main()` to the torch dtype used when loading the pipeline.
DTYPE_MAP = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}


def get_batches(items, batch_size):
    """Split `items` into consecutive slices of at most `batch_size` elements.

    The last slice holds the remainder when `len(items)` is not a multiple of `batch_size`.
    """
    total = len(items)
    # Ceiling division: a trailing partial batch still counts as a batch.
    num_batches = (total + batch_size - 1) // batch_size
    return [items[n * batch_size : min((n + 1) * batch_size, total)] for n in range(num_batches)]


def main(
    ckpt_id: str = "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
    save_dir: str = "./evaluation/examples",
    seed: int = 1,
    batch_size: int = 4,
    num_inference_steps: int = 20,
    guidance_scale: float = 4.5,
    dtype: str = "fp16",
    low_mem: bool = False,
):
    """Generate images for the Parti prompts across all processes and save them on the main process.

    Every process loads its own copy of the pipeline. Each prompt batch is split between the
    processes, the generated images and their prompts are gathered back, and the main process
    writes every image to `<save_dir>_<START_TIME>/example_<n>/image_<prompt>.png`.

    Args:
        ckpt_id: Checkpoint passed to `DiffusionPipeline.from_pretrained`.
        save_dir: Output directory prefix; the start timestamp is appended to it.
        seed: Seed used to re-create the torch generator for every batch.
        batch_size: Number of prompts per batch, before the batch is split between processes.
        num_inference_steps: Forwarded to the pipeline call.
        guidance_scale: Forwarded to the pipeline call.
        dtype: Key into `DTYPE_MAP` ("fp32", "fp16" or "bf16").
        low_mem: If True, enable model CPU offload instead of moving the whole pipeline to the device.
    """
    pipeline = DiffusionPipeline.from_pretrained(ckpt_id, torch_dtype=DTYPE_MAP[dtype])

    save_dir = save_dir + f"_{START_TIME}"

    parti_prompts = load_dataset("nateraw/parti-prompts", split="train")
    data_loader = get_batches(items=parti_prompts["Prompt"], batch_size=batch_size)

    distributed_state = PartialState()
    if low_mem:
        pipeline.enable_model_cpu_offload(gpu_id=distributed_state.device.index)
    else:
        pipeline = pipeline.to(distributed_state.device)

    # Only the main process writes images, so only it needs the output directory.
    if distributed_state.is_main_process:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
            print(f"Directory '{save_dir}' created successfully.")
        else:
            print(f"Directory '{save_dir}' already exists.")

    count = 0
    for prompts_raw in tqdm(data_loader, total=len(data_loader)):
        input_prompts = []

        with distributed_state.split_between_processes(prompts_raw) as prompts:
            # Re-seed for every batch so the generator state does not depend on earlier batches.
            generator = torch.manual_seed(seed)
            images = pipeline(
                prompts, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, generator=generator
            ).images
            input_prompts.extend(prompts)

        distributed_state.wait_for_everyone()

        # Collect every process's images and prompts so the main process can save all of them.
        images = gather_object(images)
        input_prompts = gather_object(input_prompts)

        if distributed_state.is_main_process:
            for image, prompt in zip(images, input_prompts):
                count += 1
                temp_dir = os.path.join(save_dir, f"example_{count}")

                os.makedirs(temp_dir)
                # Collapse whitespace to underscores and neutralise path separators so the prompt
                # is usable as a single file name.
                file_stem = "_".join(prompt.split()).replace(os.sep, "_")
                # Save inside the per-example directory (previously the image landed in the
                # current working directory and the directory created above stayed empty).
                image.save(os.path.join(temp_dir, f"image_{file_stem}.png"))

    if distributed_state.is_main_process:
        print(f">>> Image Generation Finished. Saved in {save_dir}")


if __name__ == "__main__":
    # python-fire exposes main()'s keyword arguments as command-line flags (e.g. --batch_size 8).
    fire.Fire(main)


================================================
FILE: examples/inference/distributed/distributed_speech_generation.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import os
import pathlib
import queue
from concurrent.futures import ThreadPoolExecutor
from typing import Union

import fire
import scipy.io.wavfile
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, VitsModel

from accelerate import PartialState
from accelerate.utils import tqdm


"""
Requirements: transformers accelerate fire scipy datasets
pip install transformers accelerate fire scipy datasets
Example usage:
accelerate launch distributed_speech_generation.py --output_path outputs --batch_size 8 --num_workers 2 --dataset_split train
"""

"""
To listen to a generated audio file (e.g. in a notebook):
import scipy.io.wavfile
import numpy as np
from IPython.display import Audio
sample_rate, audio_data = scipy.io.wavfile.read('path_to_your_wav_file.wav')
audio_data = audio_data.astype(np.float32) / 32762.0
Audio(audio_data, rate=sample_rate)
"""


def load_pokemon_data(split: str, max_text_length: int):
    """Build a list of text records from the Pokemon captions dataset.

    Captions that are empty after stripping whitespace are skipped. Each record carries an `id`
    derived from the caption's position in the split, the stripped caption cut to
    `max_text_length` characters (`text`) and the full stripped caption (`original_text`).
    """
    captions = load_dataset("svjack/pokemon-blip-captions-en-zh", split=split)["en_text"]

    records = []
    for position, raw_caption in enumerate(captions):
        caption = raw_caption.strip()
        if not caption:
            # Nothing to synthesize for blank descriptions.
            continue
        record = {
            "id": f"pokemon_{position:06d}",
            "text": caption[:max_text_length],
            "original_text": caption,
        }
        records.append(record)
    return records


class ExistsFilter:
    """Predicate that rejects samples whose audio file already exists in `output_dir`.

    The set of known ids is built once at construction time from the `.wav` files found in
    the directory; calling the instance with a sample returns True when its `id` is not in it.
    """

    def __init__(self, output_dir: Union[pathlib.Path, str]):
        self.processed_files = {
            name.split(".wav")[0] for name in os.listdir(output_dir) if name.endswith(".wav")
        }
        print(f"Existing audio files found: {len(self.processed_files)}.")

    def __call__(self, x):
        already_generated = x["id"] in self.processed_files
        return not already_generated


def preprocess_fn(sample, tokenizer, max_text_length: int):
    """Tokenize one sample's text and return list-based features together with its metadata.

    The tokenizer is called without padding and with truncation to `max_text_length`; the first
    row of its `input_ids` and `attention_mask` outputs is converted to a plain list. The
    sample's `id`, `text` and `original_text` are passed through unchanged.
    """
    encoded = tokenizer(
        sample["text"],
        padding=False,
        truncation=True,
        max_length=max_text_length,
        return_tensors="pt",
    )

    features = {key: encoded[key][0].tolist() for key in ("input_ids", "attention_mask")}
    for key in ("id", "text", "original_text"):
        features[key] = sample[key]
    return features


def collate_fn(examples, tokenizer):
    """Right-pad a list of tokenized examples to a common length and stack them into tensors.

    `input_ids` are padded with the tokenizer's pad token id and `attention_mask` with zeros,
    both up to the longest sequence in the batch. The `id`, `text` and `original_text` fields
    are collected into parallel lists.
    """
    longest = max(len(example["input_ids"]) for example in examples)
    pad_id = tokenizer.pad_token_id

    padded_ids = []
    padded_masks = []
    for example in examples:
        missing = longest - len(example["input_ids"])
        padded_ids.append(example["input_ids"] + [pad_id] * missing)
        padded_masks.append(example["attention_mask"] + [0] * missing)

    return {
        "input_ids": torch.tensor(padded_ids, dtype=torch.long),
        "attention_mask": torch.tensor(padded_masks, dtype=torch.long),
        "ids": [example["id"] for example in examples],
        "texts": [example["text"] for example in examples],
        "original_texts": [example["original_text"] for example in examples],
    }


def create_dataloader(dataset, batch_size, distributed_state, tokenizer):
    """Tokenize `dataset`, keep this process's contiguous shard and return its collated batches.

    With more than one process the tokenized samples are cut into equal contiguous chunks by
    process index; the last process additionally receives the remainder. The shard is then
    grouped into batches of at most `batch_size` samples, each passed through `collate_fn`.
    """
    samples = [preprocess_fn(item, tokenizer, max_text_length=200) for item in dataset]

    world_size = distributed_state.num_processes
    if world_size > 1:
        rank = distributed_state.process_index
        per_rank = len(samples) // world_size
        first = rank * per_rank
        # The last rank also takes whatever the integer division left over.
        last = len(samples) if rank >= world_size - 1 else first + per_rank
        samples = samples[first:last]

    return [
        collate_fn(samples[offset : offset + batch_size], tokenizer)
        for offset in range(0, len(samples), batch_size)
    ]


def save_results(output_queue: queue.Queue, output_dir: pathlib.Path, sampling_rate: int):
    """Drain `output_queue`, writing one `.wav` and one metadata JSON file per generated sample.

    Each queue item is a `(waveforms, ids, texts, original_texts)` tuple. The loop polls the
    queue with a 5 second timeout and stops when it receives `None`.
    """
    while True:
        try:
            item = output_queue.get(timeout=5)
        except queue.Empty:
            # Nothing produced yet; keep polling until the sentinel arrives.
            continue

        if item is None:
            return

        waveforms, ids, texts, original_texts = item
        for waveform, file_id, text, original_text in zip(waveforms, ids, texts, original_texts):
            # Audio file for this sample.
            audio = waveform.cpu().float().numpy()
            scipy.io.wavfile.write(output_dir / f"{file_id}.wav", rate=sampling_rate, data=audio)

            # Sidecar metadata recording both the truncated and the original text.
            metadata = {
                "text_used": text,
                "original_text": original_text,
                "model": "facebook/mms-tts-eng",
                "sampling_rate": sampling_rate,
            }
            with (output_dir / f"{file_id}_metadata.json").open("w") as f:
                json.dump(metadata, f, indent=4)


def main(
    output_path: str = "speech_data",
    batch_size: int = 8,
    num_workers: int = 2,
    dataset_split: str = "train",
    model_name: str = "facebook/mms-tts-eng",
    max_text_length: int = 200,
):
    """Synthesize speech for the Pokemon captions, sharding the work across processes.

    Args:
        output_path: Directory that receives the `.wav` and metadata files; created if missing.
        batch_size: Number of samples per model forward pass.
        num_workers: `max_workers` of the thread pool that runs `save_results`.
        dataset_split: Dataset split passed to `load_pokemon_data`.
        model_name: Checkpoint used for both the VITS model and its tokenizer.
        max_text_length: Character limit passed to `load_pokemon_data`.
    """
    output_dir = pathlib.Path(output_path)
    # Every process creates the directory; `exist_ok` makes the concurrent calls harmless.
    output_dir.mkdir(parents=True, exist_ok=True)
    distributed_state = PartialState()

    # Load model and tokenizer
    model = VitsModel.from_pretrained(
        model_name,
        device_map=distributed_state.device,
        torch_dtype=torch.float32,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load and filter data
    # (samples whose `.wav` already exists in `output_dir` are dropped, so reruns resume).
    dataset = load_pokemon_data(dataset_split, max_text_length)
    exist_filter = ExistsFilter(output_dir)
    dataset = [item for item in dataset if exist_filter(item)]

    distributed_state.print(f"Processing {len(dataset)} Pokemon descriptions")

    # Create dataloader
    # NOTE: `create_dataloader` tokenizes with its own hard-coded limit of 200; `max_text_length`
    # is not forwarded to it.
    batches = create_dataloader(dataset, batch_size, distributed_state, tokenizer)

    # Setup output queue and save thread
    # Only one `save_results` task is submitted, regardless of `num_workers`.
    output_queue = queue.Queue()
    save_thread = ThreadPoolExecutor(max_workers=num_workers)
    save_future = save_thread.submit(save_results, output_queue, output_dir, model.config.sampling_rate)

    try:
        for batch in tqdm(batches, desc="Generating Pokemon descriptions"):
            with torch.no_grad():
                outputs = model(
                    input_ids=batch["input_ids"].to(distributed_state.device, dtype=torch.long),
                    attention_mask=batch["attention_mask"].to(distributed_state.device, dtype=torch.long),
                ).waveform

                # Hand the waveforms to the saver thread so writing to disk happens off the main thread.
                output_queue.put((outputs, batch["ids"], batch["texts"], batch["original_texts"]))
    finally:
        # The `None` sentinel tells `save_results` to stop, even if generation raised.
        output_queue.put(None)
        save_thread.shutdown(wait=True)

    # Re-raise any exception that occurred inside the saver thread.
    save_future.result()


if __name__ == "__main__":
    # python-fire exposes main()'s keyword arguments as command-line flags (e.g. --output_path outputs).
    fire.Fire(main)


================================================
FILE: examples/inference/distributed/florence2.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import pathlib
import queue
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from typing import Union

import fire
import torch
import webdataset as wds
from huggingface_hub.utils import insecure_hashlib
from PIL import Image
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoProcessor

from accelerate import PartialState


"""
Additional requirements: flash_attn einops timm webdataset fire tqdm huggingface_hub
pip install flash_attn einops timm webdataset fire tqdm huggingface_hub

Example:

accelerate launch --num_processes=2 florence2.py --data_path "https://huggingface.co/datasets/pixparse/cc3m-wds/resolve/main/cc3m-train-0000.tar" --output_path outputs --batch_size 12 --num_workers 1 --prompt "<CAPTION>"
"""


def main(
    data_path: str,
    output_path: str,
    batch_size: int,
    num_workers: int,
    prompt: str = "<MORE_DETAILED_CAPTION>",
    model_name: str = "microsoft/Florence-2-large",
    max_new_tokens: int = 1024,
    num_beams: int = 3,
):
    """Caption the images of a WebDataset shard with Florence-2, splitting each batch across processes.

    Every process reads the same dataset, takes its slice of each batch, generates captions and
    hands them to a background thread that writes `<img_hash>.jpg` and `<img_hash>_caption.json`
    into `output_path`. Images whose `.jpg` already exists there are skipped.

    Args:
        data_path: WebDataset source (path or URL) passed to `wds.WebDataset`.
        output_path: Directory receiving the images and caption files; created if missing.
        batch_size: Number of samples per batch, before the batch is split between processes.
        num_workers: Worker count for the `WebLoader` and `max_workers` of the saver thread pool.
        prompt: Florence-2 task prompt, also used as the key of the post-processed result.
        model_name: Checkpoint used for both the model and the processor.
        max_new_tokens: Forwarded to `model.generate`.
        num_beams: Forwarded to `model.generate`.
    """
    output_dir = pathlib.Path(output_path)

    distributed_state = PartialState()

    # Every process lists and writes to `output_dir`, so every process must make sure it exists.
    # Creating it only on the main process fails on nodes that do not share the main node's
    # filesystem and races with the `os.listdir` call below on the others.
    output_dir.mkdir(parents=True, exist_ok=True)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=distributed_state.device,
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, clean_up_tokenization_spaces=True)

    class ExistsFilter:
        """Predicate that rejects samples whose image was already written to `output_dir`."""

        def __init__(self, output_dir: Union[pathlib.Path, str]):
            current_training_img_hashes = [f.split(".jpg")[0] for f in os.listdir(output_dir) if f.endswith(".jpg")]
            self.current_training_img_hashes = set(current_training_img_hashes)
            if distributed_state.is_main_process:
                print(f"Existing images found: {len(self.current_training_img_hashes)}.")

        def __call__(self, x):
            # With no known hashes this is always True, matching the previous explicit branch.
            return x["img_hash"] not in self.current_training_img_hashes

    def preprocess_fn(sample, processor):
        """Convert a decoded sample into model inputs plus the data needed to save the result."""
        image: Image.Image = sample["jpg"].convert("RGB")
        # The hash of the raw pixels doubles as the output file name.
        img_hash = insecure_hashlib.sha1(image.tobytes()).hexdigest()
        inputs = processor(
            text=prompt,
            images=image,
            return_tensors="pt",
        )
        return {
            "input_ids": inputs["input_ids"],
            "pixel_values": inputs["pixel_values"],
            "image": image,
            "img_hash": img_hash,
            "original_caption": sample["txt"],
        }

    def collate_fn(examples):
        """Concatenate the per-sample tensors and gather the remaining fields into lists."""
        input_ids = torch.cat([example["input_ids"] for example in examples])
        pixel_values = torch.cat([example["pixel_values"] for example in examples])
        images = [example["image"] for example in examples]
        img_hashes = [example["img_hash"] for example in examples]
        captions = [example["original_caption"] for example in examples]
        return {
            "input_ids": input_ids,
            "pixel_values": pixel_values,
            "images": images,
            "img_hashes": img_hashes,
            "original_captions": captions,
        }

    exist_filter = ExistsFilter(output_dir)
    dataset = (
        wds.WebDataset(
            data_path,
            handler=wds.warn_and_continue,
            nodesplitter=None,
            shardshuffle=False,
            empty_check=False,
        )
        .decode("pil", handler=wds.warn_and_continue)
        .map(partial(preprocess_fn, processor=processor), handler=wds.warn_and_continue)
    )
    # Only add the filtering stage when there is something to filter out.
    if len(exist_filter.current_training_img_hashes) > 0:
        dataset = dataset.select(exist_filter)
    dataset = dataset.batched(
        batch_size,
        partial=False,
        collation_fn=collate_fn,
    )
    dataloader = wds.WebLoader(
        dataset,
        batch_size=None,
        num_workers=num_workers,
        pin_memory=True,
        persistent_workers=True,
    )

    def save_results(output_queue: queue.Queue, output_dir: pathlib.Path, processor):
        """Drain the queue, writing each image and its caption JSON until `None` is received."""
        while True:
            try:
                item = output_queue.get(timeout=5)
                if item is None:
                    break
                original_captions, predictions, images, img_hashes = item
                predicted_captions = processor.batch_decode(
                    predictions,
                    skip_special_tokens=False,
                )
                for caption, pred_caption, image, img_hash in zip(
                    original_captions, predicted_captions, images, img_hashes
                ):
                    processed_caption = processor.post_process_generation(
                        pred_caption, task=prompt, image_size=(image.width, image.height)
                    )[prompt]
                    img_path = output_dir.joinpath(f"{img_hash}.jpg")
                    image.save(img_path)

                    caption_dict = {"original": caption, "predicted": processed_caption}
                    with output_dir.joinpath(f"{img_hash}_caption.json").open("w") as f:
                        json.dump(caption_dict, f, indent=4)

            except queue.Empty:
                # Nothing produced yet; keep polling until the sentinel arrives.
                continue

    output_queue = queue.Queue()
    save_thread = ThreadPoolExecutor(max_workers=num_workers)
    save_future = save_thread.submit(save_results, output_queue, output_dir, processor)

    try:
        for _, batch_raw in tqdm(
            enumerate(dataloader),
            disable=not distributed_state.is_main_process,
        ):
            with distributed_state.split_between_processes(batch_raw) as batch:
                outputs = model.generate(
                    input_ids=batch["input_ids"].to(distributed_state.device),
                    pixel_values=batch["pixel_values"].to(distributed_state.device, model.dtype),
                    max_new_tokens=max_new_tokens,
                    num_beams=num_beams,
                )
                output_queue.put(
                    (
                        batch["original_captions"],
                        outputs,
                        batch["images"],
                        batch["img_hashes"],
                    )
                )
    finally:
        # The `None` sentinel tells `save_results` to stop, even if generation raised.
        output_queue.put(None)
        save_thread.shutdown(wait=True)

    # Re-raise any exception that occurred inside the saver thread.
    save_future.result()


if __name__ == "__main__":
    # python-fire exposes main()'s parameters as command-line flags (e.g. --data_path, --output_path).
    fire.Fire(main)


================================================
FILE: examples/inference/distributed/llava_next_video.py
================================================
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import pathlib
import queue
import time
from concurrent.futures import ThreadPoolExecutor

import av
import fire
import numpy as np
import torch
from huggingface_hub import snapshot_download
from tqdm import tqdm
from transformers import LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor

from accelerate import PartialState


# Timestamp captured once at import time.
# NOTE(review): no other reference to START_TIME is visible in this script — confirm it is still needed.
START_TIME = time.strftime("%Y%m%d_%H%M%S")
# Maps the `dtype` argument of `main()` to the torch dtype used when loading the model.
DTYPE_MAP = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}


"""
Example:

accelerate launch llava_next_video.py
"""


def save_results(output_queue: queue.Queue, output_dir: pathlib.Path):
    """Drain `output_queue`, dumping each `(prompt, video, generated_text)` item as JSON.

    Items are written to `output_dir/example_<n>`, where `n` counts the items saved so far by
    this call. The loop polls the queue with a 5 second timeout and stops on `None`.
    """
    saved = 0
    while True:
        try:
            item = output_queue.get(timeout=5)
        except queue.Empty:
            # Nothing produced yet; keep polling until the sentinel arrives.
            continue

        if item is None:
            return

        prompt, video, generated_text = item
        target = os.path.join(output_dir, f"example_{saved}")
        with open(target, "w") as f:
            json.dump({"prompt": prompt, "video": video, "generated_text": generated_text}, f, indent=4)
        saved += 1


def get_batches(processed_videos, batch_size):
    """Group `processed_videos` into consecutive slices of at most `batch_size` entries.

    The last slice holds the remainder when the total is not a multiple of `batch_size`.
    """
    total = len(processed_videos)
    # Ceiling division: a trailing partial batch still counts as a batch.
    num_batches = (total + batch_size - 1) // batch_size
    return [
        processed_videos[n * batch_size : min((n + 1) * batch_size, total)] for n in range(num_batches)
    ]


def read_video_pyav(container, indices):
    """
    Decode selected frames of a video with the PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Frame indices to keep; the first and last entries bound the scan.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    """
    container.seek(0)
    first, last = indices[0], indices[-1]

    selected = []
    for position, frame in enumerate(container.decode(video=0)):
        if position > last:
            # Past the last requested frame: no need to decode the rest of the stream.
            break
        if position < first or position not in indices:
            continue
        selected.append(frame)

    return np.stack([frame.to_ndarray(format="rgb24") for frame in selected])


def get_video_paths(video_dir):
    """Recursively collect the paths of all video files found under `video_dir`.

    A file counts as a video when its lower-cased name ends with one of the known extensions.
    """
    video_extensions = (".mp4", ".avi", ".mov", ".mkv")  # Add more extensions if needed
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(video_dir)
        for name in names
        if name.lower().endswith(video_extensions)
    ]


def process_videos(video_paths, processor, prompt, frames_per_video):
    """Sample frames from each video and run them through the processor together with `prompt`.

    For every path, frame indices are spaced evenly over the stream's reported frame count, the
    frames are decoded with `read_video_pyav` and the processor output is stored alongside the
    video path. A video that raises during any of these steps is reported and skipped.
    """
    prepared = []

    for video_path in video_paths:
        try:
            with av.open(video_path) as container:
                frame_count = container.streams.video[0].frames
                step = frame_count / frames_per_video
                indices = np.arange(0, frame_count, step).astype(int)
                clip = read_video_pyav(container, indices)

                encoded = processor(text=prompt, videos=clip, return_tensors="pt")
                entry = {
                    "input_ids": encoded["input_ids"],
                    "pixel_values_videos": encoded["pixel_values_videos"],
                    "video": video_path,
                }
                prepared.append(entry)
        except Exception as e:
            # Best effort: one unreadable video must not abort the whole run.
            print(f"Error processing video {video_path}: {str(e)}")

    return prepared


def main(
    model_name: str = "llava-hf/LLaVA-NeXT-Video-7B-hf",
    save_dir: str = "./evaluation/examples",
    prompt: str = "USER: <video>\nGenerate caption ASSISTANT:",
    frames_per_video: int = 8,
    max_new_tokens: int = 100,
    batch_size: int = 4,
    dtype: str = "fp16",
    num_workers: int = 1,
    low_mem: bool = True,
):
    """Caption a small video dataset with LLaVA-NeXT-Video, splitting each batch across processes.

    Every process loads the model, pre-processes all videos, takes its slice of each batch and
    hands the generated text to a background thread that writes one JSON file per video into
    `save_dir`.

    Args:
        model_name: Checkpoint used for both the processor and the model.
        save_dir: Directory that receives the result files; created by the main process if missing.
        prompt: Text prompt given to the processor for every video.
        frames_per_video: Number of evenly spaced frames sampled from each video.
        max_new_tokens: Forwarded to `model.generate`.
        batch_size: Number of videos per batch, before the batch is split between processes.
        dtype: Key into `DTYPE_MAP` ("fp32", "fp16" or "bf16").
        num_workers: `max_workers` of the thread pool that runs `save_results`.
        low_mem: Passed as `low_cpu_mem_usage` when loading the model.
    """
    # Start up the distributed environment without needing the Accelerator.
    distributed_state = PartialState()

    processor = LlavaNextVideoProcessor.from_pretrained(model_name)
    model = LlavaNextVideoForConditionalGeneration.from_pretrained(
        model_name, torch_dtype=DTYPE_MAP[dtype], low_cpu_mem_usage=low_mem, device_map=distributed_state.device
    )

    if distributed_state.is_main_process:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
            print(f"Directory '{save_dir}' created successfully.")
        else:
            print(f"Directory '{save_dir}' already exists.")

    videos_dir = snapshot_download(repo_id="malterei/LLaVA-Video-small-swift", repo_type="dataset")
    video_paths = get_video_paths(videos_dir)
    processed_videos = process_videos(video_paths, processor, prompt, frames_per_video)
    batches = get_batches(processed_videos, batch_size)

    output_queue = queue.Queue()
    save_thread = ThreadPoolExecutor(max_workers=num_workers)
    save_future = save_thread.submit(save_results, output_queue, save_dir)

    # The try/finally wraps the WHOLE loop. Sending the stop sentinel from inside the loop made
    # the saver thread exit after the first batch, so results of later batches were never written.
    try:
        for batch_raw in tqdm(batches, total=len(batches)):
            with distributed_state.split_between_processes(batch_raw) as batched_inputs:
                for batch in batched_inputs:
                    output = model.generate(
                        input_ids=batch["input_ids"].to(distributed_state.device),
                        pixel_values_videos=batch["pixel_values_videos"].to(distributed_state.device, model.dtype),
                        max_new_tokens=max_new_tokens,
                    )
                    generated_text = processor.batch_decode(output, skip_special_tokens=True)
                    output_queue.put((prompt, batch["video"], generated_text))
    finally:
        # The `None` sentinel tells `save_results` to stop, even if generation raised.
        output_queue.put(None)
        save_thread.shutdown(wait=True)

    # Re-raise any exception that occurred inside the saver thread.
    save_future.result()
    distributed_state.destroy_process_group()


if __name__ == "__main__":
    # python-fire exposes main()'s keyword arguments as command-line flags.
    fire.Fire(main)


================================================
FILE: examples/inference/distributed/phi2.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from accelerate import PartialState
from accelerate.utils import gather_object


# Example script: generate completions for a small list of prompts, with the prompt batches
# split across all processes and the decoded strings gathered back at the end.

# Start up the distributed environment without needing the Accelerator.
distributed_state = PartialState()

# You can change the model to any LLM such as mistralai/Mistral-7B-v0.1 or meta-llama/Llama-2-7b-chat-hf
model_name = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map=distributed_state.device, torch_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Need to set the padding token to the eos token for generation
tokenizer.pad_token = tokenizer.eos_token

prompts = [
    "I would like to",
    "hello how are you",
    "what is going on",
    "roses are red and",
    "welcome to the hotel",
]

# You can change the batch size depending on your GPU RAM
batch_size = 2
# We set it to 8 since it is better for some hardware. More information here https://github.com/huggingface/tokenizers/issues/991
pad_to_multiple_of = 8

# Split into batches
# We will get the following results:
# [ ["I would like to", "hello how are you"], ["what is going on", "roses are red and"], ["welcome to the hotel"] ]
formatted_prompts = [prompts[i : i + batch_size] for i in range(0, len(prompts), batch_size)]

# Apply padding on the left since we are doing generation
# (the previous padding side is remembered so it can be restored after tokenization)
padding_side_default = tokenizer.padding_side
tokenizer.padding_side = "left"
# Tokenize each batch
tokenized_prompts = [
    tokenizer(formatted_prompt, padding=True, pad_to_multiple_of=pad_to_multiple_of, return_tensors="pt")
    for formatted_prompt in formatted_prompts
]
# Put back the original padding behavior
tokenizer.padding_side = padding_side_default

completions_per_process = []
# We automatically split the batched data we passed to it across all the processes. We also set apply_padding=True
# so that the GPUs will have the same number of prompts, and you can then gather the results.
# For example, if we have 2 gpus, the distribution will be:
# GPU 0: ["I would like to", "hello how are you"], ["what is going on", "roses are red and"]
# GPU 1: ["welcome to the hotel"], ["welcome to the hotel"] -> this prompt is duplicated to ensure that all gpus have the same number of prompts
with distributed_state.split_between_processes(tokenized_prompts, apply_padding=True) as batched_prompts:
    for batch in batched_prompts:
        # Move the batch to the device
        batch = batch.to(distributed_state.device)
        # We generate the text, decode it and add it to the list completions_per_process
        outputs = model.generate(**batch, max_new_tokens=20)
        generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        completions_per_process.extend(generated_text)

# We are gathering strings, so we need to use gather_object.
# If you need to gather tensors, you can use gather from accelerate.utils
completions_gather = gather_object(completions_per_process)

# Drop duplicates produced by apply_padding in split_between_processes:
# only the first len(prompts) gathered completions are kept.
completions = completions_gather[: len(prompts)]

distributed_state.print(completions)


================================================
FILE: examples/inference/distributed/stable_diffusion.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from diffusers import DiffusionPipeline

from accelerate import PartialState  # Can also be Accelerator or AcceleratorState


# Example script: load a diffusion pipeline in fp16, place it on this process's device,
# and run it on this process's share of the prompt list.
pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16)
distributed_state = PartialState()
pipe.to(distributed_state.device)

# Assume two processes
# On the first GPU, the prompts will be ["a dog", "a cat"],
# and on the second GPU it will be ["a chicken", "a chicken"].
# Make sure to drop the final sample, as it will be a duplicate of the previous one.
# (This script itself does not drop it; `result` still holds the duplicate.)
with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"], apply_padding=True) as prompt:
    result = pipe(prompt).images


================================================
FILE: examples/inference/pippy/README.md
================================================
# Distributed inference examples with PiPPy

This repo contains a variety of tutorials for using the [PiPPy](https://github.com/PyTorch/PiPPy) pipeline parallelism library with accelerate. You will find examples covering:

1. How to trace the model using `accelerate.prepare_pippy`
2. How to specify inputs based on what the model expects (when to use `kwargs`, `args`, and such)
3. How to gather the results at the end.

## Installation

This requires the `main` branch of accelerate (or a version at least 0.27.0), `torchpippy` version 0.2.0 or greater, and at least Python 3.9. Please install using `pip install .` to pull from the `setup.py` in this repo, or run manually:

```bash
pip install 'accelerate>=0.27.0' 'torchpippy>=0.2.0'
```

## Running code

You can either use `torchrun` or the recommended way of `accelerate launch` (without needing to run `accelerate config`) on each script:

```bash
accelerate launch bert.py
```

Or:

```bash
accelerate launch --num_processes {NUM_GPUS} bert.py
```

Or:

```bash
torchrun --nproc-per-node {NUM_GPUS} bert.py
```

## General speedups

One can expect that PiPPy will outperform native model parallelism by a multiplicative factor since all GPUs are running at all times with inputs, rather than one input being passed through a GPU at a time waiting for the prior to finish.

Below are some benchmarks we have found when using the accelerate-pippy integration for a few models when running on 2x4090's:

### Bert

|  | Accelerate/Sequential | PiPPy + Accelerate |
|---|---|---|
| First batch | 0.2137s | 0.3119s |
| Average of 5 batches | 0.0099s | **0.0062s** |

### GPT2

|  | Accelerate/Sequential | PiPPy + Accelerate |
|---|---|---|
| First batch | 0.1959s | 0.4189s |
| Average of 5 batches | 0.0205s | **0.0126s** |

### T5

|  | Accelerate/Sequential | PiPPy + Accelerate |
|---|---|---|
| First batch | 0.2789s | 0.3809s |
| Average of 5 batches | 0.0198s | **0.0166s** |

================================================
FILE: examples/inference/pippy/bert.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time

import torch
from transformers import AutoModelForMaskedLM

from accelerate import PartialState, prepare_pippy
from accelerate.test_utils import torch_device
from accelerate.utils import set_seed


# Use the `synchronize` function of the torch backend module named by `torch_device`,
# falling back to `torch.cuda` when `torch` has no attribute with that name.
synchronize_func = getattr(torch, torch_device, torch.cuda).synchronize

# Set the random seed to have reproducible outputs
set_seed(42)

# Create an example model
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
model.eval()

# Input configs
# Create example inputs for the model
input = torch.randint(
    low=0,
    high=model.config.vocab_size,
    size=(1, 512),  # bs x seq_len
    device="cpu",
    dtype=torch.int64,
    requires_grad=False,
)


# Create a pipeline stage from the model
# Using `auto` is equivalent to letting `device_map="auto"` figure
# out device mapping and will also split the model according to the
# number of total GPUs available if it fits on one GPU
model = prepare_pippy(model, split_points="auto", example_args=(input,))

# You can pass `gather_output=True` to have the output from the model
# available on all GPUs
# model = prepare_pippy(model, split_points="auto", example_args=(input,), gather_output=True)

# Create new inputs of the expected size (n_processes)
input = torch.randint(
    low=0,
    high=model.config.vocab_size,
    size=(2, 512),  # bs x seq_len
    device="cpu",
    dtype=torch.int64,
    requires_grad=False,
)

# Move the inputs to the first device
input = input.to(torch_device)

# Time the first forward pass on its own, then average over 5 further passes.
# The device is synchronized before reading the clock on both sides of each measurement.
# Measure first batch
synchronize_func()
start_time = time.time()
with torch.no_grad():
    output = model(input)
synchronize_func()
end_time = time.time()
first_batch = end_time - start_time

# Now that the device has been initialized by the first pass, measure the following passes
synchronize_func()
start_time = time.time()
for i in range(5):
    with torch.no_grad():
        output = model(input)
synchronize_func()
end_time = time.time()

# The outputs are only on the final process by default
if PartialState().is_last_process:
    output = torch.stack(tuple(output[0]))
    print(f"Time of first pass: {first_batch}")
    # `start_time`/`end_time` now bracket the 5-pass loop, hence the division by 5
    print(f"Average time per batch: {(end_time - start_time) / 5}")
PartialState().destroy_process_group()


================================================
FILE: examples/inference/pippy/gpt2.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time

import torch
from transformers import AutoModelForSequenceClassification

from accelerate import PartialState, prepare_pippy
from accelerate.test_utils import torch_device
from accelerate.utils import set_seed


# Use the `synchronize` function of the torch backend module named by `torch_device`,
# falling back to `torch.cuda` when `torch` has no attribute with that name.
synchronize_func = getattr(torch, torch_device, torch.cuda).synchronize

# Set the random seed to have reproducible outputs
set_seed(42)

# Create an example model
model = AutoModelForSequenceClassification.from_pretrained("gpt2")
model.eval()

# Input configs
# Create example inputs for the model
input = torch.randint(
    low=0,
    high=model.config.vocab_size,
    size=(1, 1024),  # bs x seq_len
    device="cpu",
    dtype=torch.int64,
    requires_grad=False,
)

# Create a pipeline stage from the model
# Using `auto` is equivalent to letting `device_map="auto"` figure
# out device mapping and will also split the model according to the
# number of total GPUs available if it fits on one GPU
model = prepare_pippy(model, split_points="auto", example_args=(input,))

# You can pass `gather_output=True` to have the output from the model
# available on all GPUs
# model = prepare_pippy(model, split_points="auto", example_args=(input,), gather_output=True)

# Create new inputs of the expected size (n_processes)
input = torch.randint(
    low=0,
    high=model.config.vocab_size,
    size=(2, 1024),  # bs x seq_len
    device="cpu",
    dtype=torch.int64,
    requires_grad=False,
)

# Move the inputs to the first device
input = input.to(torch_device)

# Time the first forward pass on its own, then average over 5 further passes.
# The device is synchronized before reading the clock on both sides of each measurement.
# Measure first batch
synchronize_func()
start_time = time.time()
with torch.no_grad():
    output = model(input)
synchronize_func()
end_time = time.time()
first_batch = end_time - start_time

# Now that device/backend is init, measure after
synchronize_func()
start_time = time.time()
for i in range(5):
    with torch.no_grad():
        output = model(input)
synchronize_func()
end_time = time.time()

# The outputs are only on the final process by default
if PartialState().is_last_process:
    output = torch.stack(tuple(output[0]))
    print(f"Time of first pass: {first_batch}")
    # `start_time`/`end_time` now bracket the 5-pass loop, hence the division by 5
    print(f"Average time per batch: {(end_time - start_time) / 5}")
PartialState().destroy_process_group()


================================================
FILE: examples/inference/pippy/llama.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from accelerate import PartialState, prepare_pippy


# sdpa implementation which is the default torch>2.1.2 fails with the tracing + attention mask kwarg
# with attn_implementation="eager" mode, the forward is very slow for some reason
# NOTE(review): the two lines above describe sdpa as failing, yet "sdpa" is what is selected
# below -- confirm which attention implementation is actually intended here.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf", low_cpu_mem_usage=True, attn_implementation="sdpa"
)
model.eval()

# Input configs
# Create example inputs for the model
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
prompts = ("I would like to", "I really like to")  # bs = 2, sending 2 per process
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(prompts, return_tensors="pt", padding=True)

# Create a pipeline stage from the model
# Using `auto` is equivalent to letting `device_map="auto"` figure
# out device mapping and will also split the model according to the
# number of total GPUs available if it fits on one GPU
model = prepare_pippy(model, split_points="auto", example_kwargs=inputs)

# You can pass `gather_output=True` to have the output from the model
# available on all GPUs
# model = prepare_pippy(model, split_points="auto", example_kwargs=inputs, gather_output=True)

# currently we don't support `model.generate`
# output = model.generate(**inputs, max_new_tokens=1)
prompts = ("I would like to", "I really like to", "The weather is pretty")  # bs = 3
inputs = tokenizer(prompts, return_tensors="pt", padding=True)
# Move the tokenized inputs to device index 0 before the forward pass
inputs = inputs.to(0)
with torch.no_grad():
    output = model(**inputs)

# The outputs are only on the final process by default
if PartialState().is_last_process:
    # Greedy pick of the next token: argmax over the last dimension at the final sequence position
    next_token_logits = output[0][:, -1, :]
    next_token = torch.argmax(next_token_logits, dim=-1)
    print(tokenizer.batch_decode(next_token))
PartialState().destroy_process_group()


================================================
FILE: examples/inference/pippy/requirements.txt
================================================
accelerate
pippy>=0.2.0

================================================
FILE: examples/inference/pippy/t5.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time

import torch
from packaging import version
from transformers import AutoModelForSeq2SeqLM

from accelerate import PartialState, prepare_pippy
from accelerate import __version__ as accelerate_version
from accelerate.test_utils import torch_device
from accelerate.utils import set_seed


# Use the `synchronize` function of the torch backend module named by `torch_device`,
# falling back to `torch.cuda` when `torch` has no attribute with that name.
synchronize_func = getattr(torch, torch_device, torch.cuda).synchronize

# Refuse to run on accelerate versions newer than 0.33.0 (see the error message for the reason).
if version.parse(accelerate_version) > version.parse("0.33.0"):
    raise RuntimeError(
        "Using encoder/decoder models is not supported with the `torch.pipelining` integration or accelerate>=0.34.0. "
        "Please use a lower accelerate version and `torchpippy`, which this example uses."
    )


# Set the random seed to have reproducible outputs
set_seed(42)

# Create an example model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
model.eval()

# Input configs
# Create example inputs for the model
input = torch.randint(
    low=0,
    high=model.config.vocab_size,
    size=(2, 1024),  # bs x seq_len
    device="cpu",
    dtype=torch.int64,
    requires_grad=False,
)

# The same random ids are used for both the encoder and the decoder inputs
example_inputs = {"input_ids": input, "decoder_input_ids": input}

# Create a pipeline stage from the model
# Using `auto` is equivalent to letting `device_map="auto"` figure
# out device mapping and will also split the model according to the
# number of total GPUs available if it fits on one GPU
model = prepare_pippy(
    model,
    no_split_module_classes=["T5Block"],
    example_kwargs=example_inputs,
)

# You can pass `gather_output=True` to have the output from the model
# available on all GPUs
# model = prepare_pippy(
#     model,
#     no_split_module_classes=["T5Block"],
#     example_kwargs=example_inputs,
#     gather_output=True
# )

# The model expects a tuple during real inference
# with the data on the first device
args = (example_inputs["input_ids"].to(0), example_inputs["decoder_input_ids"].to(0))

# Time the first forward pass on its own, then average over 5 further passes.
# The device is synchronized before reading the clock on both sides of each measurement.
# Measure first batch
synchronize_func()
start_time = time.time()
with torch.no_grad():
    output = model(*args)
synchronize_func()
end_time = time.time()
first_batch = end_time - start_time

# Now that device is init, measure after
synchronize_func()
start_time = time.time()
for i in range(5):
    with torch.no_grad():
        output = model(*args)
synchronize_func()
end_time = time.time()

# The outputs are only on the final process by default
if PartialState().is_last_process:
    output = torch.stack(tuple(output[0]))
    print(f"Time of first pass: {first_batch}")
    # `start_time`/`end_time` now bracket the 5-pass loop, hence the division by 5
    print(f"Average time per batch: {(end_time - start_time) / 5}")
PartialState().destroy_process_group()


================================================
FILE: examples/multigpu_remote_launcher.py
================================================
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

import runhouse as rh
import torch
from nlp_example import training_function

from accelerate.utils import PrepareForLaunch, patch_environment


def launch_train(*args):
    """
    Spawn one `training_function` worker per CUDA device reported by `torch.cuda.device_count()`.

    Args:
        *args:
            Positional arguments forwarded unchanged to every spawned worker. The second one (`args[1]`) must
            expose a `mixed_precision` attribute, which is passed to `patch_environment`.
    """
    gpu_count = torch.cuda.device_count()
    print(f"Device count: {gpu_count}")

    # Settings handed to `patch_environment` for the duration of the spawn
    env_overrides = {
        "world_size": gpu_count,
        "master_addr": "127.0.0.1",
        "master_port": "29500",
        "mixed_precision": args[1].mixed_precision,
    }
    with patch_environment(**env_overrides):
        spawn_target = PrepareForLaunch(training_function, distributed_type="MULTI_GPU")
        torch.multiprocessing.start_processes(spawn_target, args=args, nprocs=gpu_count, start_method="spawn")


# Script entry point: bring up a runhouse cluster, wrap `launch_train` as a remote function
# with its package requirements, and call it with a fixed training config.
if __name__ == "__main__":
    # Refer to https://runhouse-docs.readthedocs-hosted.com/en/main/rh_primitives/cluster.html#hardware-setup
    # for cloud access setup instructions (if using on-demand hardware), and for API specifications.

    # on-demand GPU
    # gpu = rh.cluster(name='rh-cluster', instance_type='V100:1', provider='cheapest', use_spot=False)  # single GPU
    gpu = rh.cluster(name="rh-cluster", instance_type="V100:4", provider="cheapest", use_spot=False)  # multi GPU
    gpu.up_if_not()

    # on-prem GPU
    # gpu = rh.cluster(
    #           ips=["ip_addr"], ssh_creds={ssh_user:"<username>", ssh_private_key:"<key_path>"}, name="rh-cluster"
    #       )

    # Set up remote function
    # Requirement strings passed to `rh.function` below
    reqs = [
        "pip:./",
        "transformers",
        "datasets",
        "evaluate",
        "tqdm",
        "scipy",
        "scikit-learn",
        "tensorboard",
        "torch --upgrade --extra-index-url https://download.pytorch.org/whl/cu117",
    ]
    launch_train_gpu = rh.function(fn=launch_train, system=gpu, reqs=reqs, name="train_bert_glue")

    # Define train args/config, run train function
    # `train_args` is the second positional argument, which `launch_train` reads `mixed_precision` from
    train_args = argparse.Namespace(cpu=False, mixed_precision="fp16")
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    launch_train_gpu(config, train_args, stream_logs=True)

    # Alternatively, we can just run as instructed in the README (but only because there's already a wrapper CLI):
    # gpu.install_packages(reqs)
    # gpu.run(['accelerate launch --multi_gpu accelerate/examples/nlp_example.py'])


================================================
FILE: examples/nlp_example.py
================================================
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType


########################################################################
# This is a fully working simple example to use Accelerate
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUS (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################


# Largest per-step batch size; larger requested batch sizes are reached through gradient accumulation.
MAX_GPU_BATCH_SIZE = 16
# Batch size of the validation dataloader.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    """
    Builds the train and validation `DataLoader`s for GLUE MRPC, tokenized with "bert-base-cased".

    Args:
        accelerator (`Accelerator`):
            The `Accelerator` whose distributed type and mixed precision mode drive the padding strategy.
        batch_size (`int`, *optional*):
            Batch size of the training DataLoader (the validation one uses `EVAL_BATCH_SIZE`).

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    raw_datasets = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => fall back to the model max length (this is the default anyway)
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every split, letting the main process go first
    with accelerator.main_process_first():
        tokenized = raw_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # The transformers models expect the target column to be called 'labels'
    tokenized = tokenized.rename_column("label", "labels")

    def collate_fn(examples):
        # On XLA a fixed length is used, otherwise training would be very slow
        if accelerator.distributed_type == DistributedType.XLA:
            fixed_length = 128
        else:
            fixed_length = None

        # Mixed precision prefers lengths that are round multiples of 8 (16 for fp8)
        precision = accelerator.mixed_precision
        if precision == "fp8":
            multiple = 16
        elif precision == "no":
            multiple = None
        else:
            multiple = 8

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=fixed_length,
            pad_to_multiple_of=multiple,
            return_tensors="pt",
        )

    # Build both dataloaders; only the training one is shuffled
    train_loader = DataLoader(
        tokenized["train"],
        shuffle=True,
        collate_fn=collate_fn,
        batch_size=batch_size,
        drop_last=True,
    )
    # The last incomplete validation batch is dropped only in fp8 mode
    drop_incomplete_eval = accelerator.mixed_precision == "fp8"
    eval_loader = DataLoader(
        tokenized["validation"],
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=EVAL_BATCH_SIZE,
        drop_last=drop_incomplete_eval,
    )

    return train_loader, eval_loader


def training_function(config, args):
    """
    Fine-tunes "bert-base-cased" on GLUE MRPC and reports the evaluation metric after every epoch.

    Args:
        config (`dict`):
            Hyper-parameters. The keys `lr`, `num_epochs`, `seed` and `batch_size` are read.
        args:
            Object exposing `cpu` and `mixed_precision` attributes, both forwarded to `Accelerator`.
    """
    # Initialize accelerator
    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])

    metric = evaluate.load("glue", "mrpc")

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
    # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)
    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
    )

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.

    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # Now we train the model
    for epoch in range(num_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            outputs = model(**batch)
            loss = outputs.loss
            # Scale the loss so that the accumulated gradient is an average over the accumulation window
            loss = loss / gradient_accumulation_steps
            accelerator.backward(loss)
            if step % gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

        model.eval()
        for batch in eval_dataloader:
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            # Collect predictions and labels from all processes before feeding the metric
            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)
    accelerator.end_training()


def main():
    """Parse the command line options and run `training_function` with a fixed hyper-parameter config."""
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are concatenated as-is, so each piece carries its own trailing space.
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


if __name__ == "__main__":
    main()


================================================
FILE: examples/requirements.txt
================================================
accelerate # used to be installed in Amazon SageMaker environment
evaluate
datasets
schedulefree
huggingface_hub>=0.20.0


================================================
FILE: examples/slurm/fsdp_config.yaml
================================================
distributed_type: FSDP
fsdp_config:
  fsdp_activation_checkpointing: false
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: true
  fsdp_forward_prefetch: false
  fsdp_offload_params: false
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_use_orig_params: true


================================================
FILE: examples/slurm/submit_multicpu.sh
================================================
#!/bin/bash -l

# SLURM batch script: runs examples/complete_nlp_example.py on CPU, either directly with
# python (single node, single instance) or through `accelerate launch` with an MPI hostfile.

#SBATCH --job-name=multicpu
#SBATCH --nodes=2                       # number of Nodes
#SBATCH --ntasks-per-node=1             # number of MP tasks
#SBATCH --exclusive
#SBATCH --output=O-%x.%j
#SBATCH --error=E-%x.%j

######################
### Set environment ###
######################
source activateEnvironment.sh

######################
#### Set network #####
######################
# The first host of the allocation acts as the main process
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
######################

# Setup env variables for distributed jobs
export MASTER_PORT="${MASTER_PORT:-29555}"
echo "head_node_ip=${head_node_ip}"
echo "MASTER_PORT=${MASTER_PORT}"

INSTANCES_PER_NODE="${INSTANCES_PER_NODE:-1}"

if [[ $SLURM_NNODES == 1 ]] && [[ $INSTANCES_PER_NODE == 1 ]]; then
  # Single process: no distributed launcher needed, run the script with the interpreter directly
  export CCL_WORKER_COUNT=0
  LAUNCHER="python"
else
  # Setup env variables for distributed jobs
  export CCL_WORKER_COUNT="${CCL_WORKER_COUNT:-2}"
  echo "CCL_WORKER_COUNT=${CCL_WORKER_COUNT}"

  # Write hostfile: one line per allocated host (each hostname is printed once, newline kept)
  HOSTFILE_PATH=hostfile
  scontrol show hostname $SLURM_JOB_NODELIST | perl -ne 'print "$_"x1' > ${HOSTFILE_PATH}

  export LAUNCHER="accelerate launch \
    --num_processes $((SLURM_NNODES * ${INSTANCES_PER_NODE})) \
    --num_machines $SLURM_NNODES \
    --rdzv_backend c10d \
    --main_process_ip $head_node_ip \
    --main_process_port $MASTER_PORT \
    --mpirun_hostfile $HOSTFILE_PATH \
    --mpirun_ccl $CCL_WORKER_COUNT"
fi

# Location of the accelerate checkout and of the script to run
export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
export SCRIPT_ARGS=" \
    --cpu \
    --output_dir ${ACCELERATE_DIR}/examples/output \
    "

# This step is necessary because accelerate launch does not handle multiline arguments properly
export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS"
# Print the command
echo $CMD
echo ""

# Run the command
# ($CMD is left unquoted on purpose so the multi-line string collapses to a single line)
eval $CMD


================================================
FILE: examples/slurm/submit_multigpu.sh
================================================
#!/bin/bash

#SBATCH --job-name=multigpu
#SBATCH -D .
#SBATCH --output=O-%x.%j
#SBATCH --error=E-%x.%j
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1         # number of MP tasks
#SBATCH --gres=gpu:4                # number of GPUs per node
#SBATCH --cpus-per-task=160         # number of cores per tasks
#SBATCH --time=01:59:00             # maximum execution time (HH:MM:SS)

# Single-node, multi-GPU Slurm job: runs examples/complete_nlp_example.py
# through `accelerate launch` with one process per GPU.

######################
### Set environment ###
######################
source activateEnvironment.sh
# Keep in sync with the `--gres=gpu:N` request above; used as --num_processes.
export GPUS_PER_NODE=4
######################

# ACCELERATE_DIR can be overridden from the environment; defaults to /accelerate.
export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
export SCRIPT_ARGS=" \
    --mixed_precision fp16 \
    --output_dir ${ACCELERATE_DIR}/examples/output \
    --with_tracking \
    "

# $SCRIPT_ARGS is intentionally unquoted so it is word-split into separate arguments.
accelerate launch --num_processes $GPUS_PER_NODE $SCRIPT $SCRIPT_ARGS

================================================
FILE: examples/slurm/submit_multinode.sh
================================================
#!/bin/bash

#SBATCH --job-name=multinode
#SBATCH -D .
#SBATCH --output=O-%x.%j
#SBATCH --error=E-%x.%j
#SBATCH --nodes=4                   # number of nodes
#SBATCH --ntasks-per-node=1         # number of MP tasks
#SBATCH --gres=gpu:4                # number of GPUs per node
#SBATCH --cpus-per-task=160         # number of cores per tasks
#SBATCH --time=01:59:00             # maximum execution time (HH:MM:SS)

# Multi-node, multi-GPU Slurm job: `srun` starts one `accelerate launch`
# per node (ntasks-per-node=1), all pointing at the first node for rendezvous.

######################
### Set environment ###
######################
source activateEnvironment.sh
# Keep in sync with the `--gres=gpu:N` request above.
export GPUS_PER_NODE=4
######################

######################
#### Set network #####
######################
# The first hostname of the allocation is used as the main process address.
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
######################

# Total process count = nodes * GPUs per node; the rendezvous port is fixed to 29500.
export LAUNCHER="accelerate launch \
    --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
    --num_machines $SLURM_NNODES \
    --rdzv_backend c10d \
    --main_process_ip $head_node_ip \
    --main_process_port 29500 \
    "
# ACCELERATE_DIR can be overridden from the environment; defaults to /accelerate.
export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
export SCRIPT_ARGS=" \
    --mixed_precision fp16 \
    --output_dir ${ACCELERATE_DIR}/examples/output \
    "
    
# This step is necessary because accelerate launch does not handle multiline arguments properly
export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS" 
# $CMD is intentionally unquoted so it is word-split into a command and its arguments.
srun $CMD


================================================
FILE: examples/slurm/submit_multinode_fsdp.sh
================================================
#!/bin/bash

#SBATCH --job-name=multinode
#SBATCH -D .
#SBATCH --output=O-%x.%j
#SBATCH --error=E-%x.%j
#SBATCH --nodes=4                   # number of nodes
#SBATCH --ntasks-per-node=1         # number of MP tasks
#SBATCH --gres=gpu:4                # number of GPUs per node
#SBATCH --cpus-per-task=160         # number of cores per tasks
#SBATCH --time=01:59:00             # maximum execution time (HH:MM:SS)

# Multi-node FSDP Slurm job: same layout as the plain multi-node script, but
# `accelerate launch` additionally reads examples/slurm/fsdp_config.yaml.

######################
### Set environment ###
######################
source activateEnvironment.sh
# Keep in sync with the `--gres=gpu:N` request above.
export GPUS_PER_NODE=4
######################

######################
#### Set network #####
######################
# The first hostname of the allocation is used as the main process address.
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
######################
# Must be set before LAUNCHER below, which expands it for --config_file.
export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"

# Total process count = nodes * GPUs per node; the rendezvous port is fixed to 29500.
export LAUNCHER="accelerate launch \
    --config_file ${ACCELERATE_DIR}/examples/slurm/fsdp_config.yaml \
    --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
    --num_machines $SLURM_NNODES \
    --rdzv_backend c10d \
    --main_process_ip $head_node_ip \
    --main_process_port 29500 \
    "
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
export SCRIPT_ARGS=" \
    --mixed_precision fp16 \
    --output_dir ${ACCELERATE_DIR}/examples/output \
    "
    
# This step is necessary because accelerate launch does not handle multiline arguments properly
export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS" 
# $CMD is intentionally unquoted so it is word-split into a command and its arguments.
srun $CMD

================================================
FILE: examples/torch_native_parallelism/README.md
================================================
## Torch Native Parallelism

With recent versions of Torch, there have been steady improvements in native parallelism using `DeviceMesh` and `DTensor`. 🤗 accelerate allows you to use these with our `ParallelismConfig` abstraction and/or `FullyShardedDataParallelPlugin(fsdp_version=2)`.
This folder contains various examples of such use cases, such as composing multiple parallelism strategies, low-bit training, etc.

### ND Parallelism

With `ParallelismConfig`, you can use 🤗 accelerate to train models with n-dimensional parallelism. This builds on top of 🤗 transformers, which we utilize for tensor parallelism sharding.
Accelerate then takes care of everything else, such as data parallelism, FSDP or context parallelism.
Script `nd_parallel.py` showcases this. We enable you to configure 4 different parallel dimensions (for now 👀):
- dp_replicate_size: how many replicas of the model to create, each replica is trained on a different subset of the data and averaged at the end of each step, same as DDP in Torch
- dp_shard_size: across how many devices is the model sharded, this is utilizing FSDP2 to shard the model across devices, so each device has a different part of the model
- tp_size: how many devices to use for tensor parallelism, this is utilizing the tensor parallelism from 🤗 transformers
- cp_size: how many devices to use for context parallelism, this will also shard the model, optimizer and gradients using `FSDP2` across
the same group of devices, to further optimize memory usage (this comes with no slowdown)

For example, with 8 GPUs (one process per GPU), you can run the script as such:
```bash
accelerate launch --num-processes 8 nd_parallel.py \
    --dp-replicate-size 2 \
    --dp-shard-size 2 \
    --tp-size 2
```

> [!Tip]
> Only use TP intra-node - therefore max TP size you should need is 8. You can also use a lower size, as FSDP (`--dp-shard-size`) can be faster on smaller models with shorter sequence lengths. If you cannot fit your model into memory, utilize `--dp-shard-size` as much as you can. Afterwards, to scale up and utilize all your resources, use `--dp-replicate-size`. This is only a general guideline, you can (and should) experiment with different parallelism configurations to find the best one for your model and hardware. You can learn more about the general strategies for parallelism in our [blog](https://huggingface.co/blog/accelerate-nd-parallel), or if you really want to dive deep, read the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook).


This feature is also fully integrated into 🤗 transformers `Trainer`. To use it, simply launch your script with the path to your accelerate configuration file. You can see a minimal example of such a script in `nd_parallel_trainer.py`.
We provide 2 pre-configured configuration files:

#### HSDP + TP (3D parallelism)

```bash
accelerate launch --config-file configs/tp_hsdp.yaml nd_parallel_trainer.py
```

#### Context parallelism (128k sequence length)

```bash
accelerate launch --config-file configs/cp.yaml nd_parallel_trainer.py --sequence-length=128000
```

### FSDP2 + ao Float8Linear

In file `fsdp2_fp8.py` we use `Float8Linear` from `ao` to train a model partially in FP8 precision. We utilize `AORecipeKwargs` to pass the `Float8LinearConfig` to the accelerator, 
which replaces the default `torch.nn.Linear` with `Float8Linear`. We also utilize `TorchDynamoPlugin` together with regional compilation to compile the model,
gaining even more speed and memory savings, as `ao` doesn't ship with any kernels by default, so we have to gain the performance from compiling the model.

Replacing linear layers with `Float8Linear` can greatly improve performance, if used correctly and on hardware that supports FP8 tensor cores. This highly depends on the model dimensions and sequence length used for training.
You can view the performance of `Float8Linear` as a function of matrix dimensions in [this document](https://github.com/pytorch/ao/blob/main/torchao/float8/README.md#performance). 

In our example, we use an 8B Llama3.1 model, which has a hidden dimension of 4096 and we train on sequence length of 8192. In the below images, we can see that this improves performance by ~25% compared to `bf16`, reaching ~10000 tokens per second, per device on 8x H100 GPUs, compared to ~8000 tokens per second using `bf16`, while loss function stays roughly the same. We can also see that the FLOPS rise by using FP8.

<div style="display: flex; gap: 25px;">
  <div style="text-align: center; width: 49%;">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/accelerate/examples/fsdp2/fp8_tps.png" alt="tps" style="width: 100%;">
    <p style="text-align: center; margin-top: 8px;">TPS per device, BF16 vs FP8</p>
  </div>
  <div style="text-align: center; width: 49%;">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/accelerate/examples/fsdp2/fp8_tflops.png" alt="tflops" style="width: 100%;">
    <p style="text-align: center; margin-top: 8px;">TFLOPS per device, BF16 vs FP8. We cannot really compare MFU as FP8 tensor cores are used as well.</p>
  </div>
  
  <div style="text-align: center; width: 49%;">  
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/accelerate/examples/fsdp2/fp8_loss.png" alt="loss" style="width: 100%; max-width: 900px;">
    <p style="text-align: center; margin-top: 8px;">Loss curve, BF16 vs FP8, it's hard to see the difference as the curves mostly overlap</p>
  </div>
</div>

The figures above were generated on 8x H100 SXM GPUs, with 8192 sequence length and 1000 steps. To run the example, you can use the following command, where you can specify the precision to train in:

```bash
accelerate launch fsdp2_fp8.py --sequence-length 8192 --num-steps 1000 --log-with wandb --precision [fp8 | bf16]
```


================================================
FILE: examples/torch_native_parallelism/configs/cp.yaml
================================================
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
enable_cpu_affinity: false
fsdp_config:
  fsdp_activation_checkpointing: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_cpu_ram_efficient_loading: false
  fsdp_offload_params: false
  fsdp_reshard_after_forward: true
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_version: 2
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
parallelism_config:
  parallelism_config_cp_size: 8
  parallelism_config_dp_replicate_size: 1
  parallelism_config_dp_shard_size: 1
  parallelism_config_tp_size: 1
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false


================================================
FILE: examples/torch_native_parallelism/configs/tp_hsdp.yaml
================================================
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
enable_cpu_affinity: false
fsdp_config:
  fsdp_activation_checkpointing: false
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_cpu_ram_efficient_loading: false
  fsdp_offload_params: false
  fsdp_reshard_after_forward: true
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_version: 2
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
parallelism_config:
  parallelism_config_cp_size: 1
  parallelism_config_dp_replicate_size: 2
  parallelism_config_dp_shard_size: 2
  parallelism_config_tp_size: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false


================================================
FILE: examples/torch_native_parallelism/fsdp2_fp8.py
================================================
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Minimal example of training with FP8 precision using FSDP2 via Accelerate.
This example demonstrates how to use torchao's Float8LinearConfig with Accelerate's AORecipeKwargs.
"""

import argparse

import torch
from torch.utils.data import DataLoader
from torchao.float8 import Float8LinearConfig
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

from accelerate import Accelerator
from accelerate.utils import AORecipeKwargs, FullyShardedDataParallelPlugin, TorchDynamoPlugin, set_seed
from utils import PerformanceTracker, create_collate_fn, get_dataset, get_model_flops_per_token


WARMUP_STEPS = 10

MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"


def parse_args():
    """
    Parse the command-line options of the FSDP2 + FP8 example.

    Returns:
        argparse.Namespace: with `sequence_length`, `num_steps`, `precision` and `log_with` attributes.
    """
    option_table = (
        ("--sequence-length", {"type": int, "default": 8192, "help": "Sequence length for the dataset"}),
        ("--num-steps", {"type": int, "default": 1000, "help": "Number of steps to train for"}),
        (
            "--precision",
            {"type": str, "default": "fp8", "choices": ["fp8", "bf16"], "help": "Precision to train in"},
        ),
        ("--log-with", {"type": str, "default": "wandb", "help": "Log with wandb or tensorboard"}),
    )

    cli = argparse.ArgumentParser()
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)

    return cli.parse_args()


def main():
    """
    Train a causal LM with FSDP2, optionally using torchao FP8 linear layers.

    Seeds with 42, parses the CLI, builds the FSDP2 / dynamo plugins, creates the `Accelerator`
    (with an `AORecipeKwargs` handler only when `--precision fp8` is selected), instantiates the
    model from the config of `MODEL_ID`, and runs a plain forward/backward/step loop for at most
    `--num-steps` steps while printing and logging the metrics returned by `PerformanceTracker`.
    """
    set_seed(42)

    args = parse_args()

    fsdp2_plugin = FullyShardedDataParallelPlugin(
        fsdp_version=2,
        cpu_ram_efficient_loading=False,  # CPU RAM efficient loading CANNOT work with fp8 torchao
        auto_wrap_policy="transformer_based_wrap",
        transformer_cls_names_to_wrap=["LlamaDecoderLayer"],
    )
    fsdp2_plugin.set_mixed_precision(args.precision)

    dynamo_plugin = TorchDynamoPlugin(
        backend="inductor",
        use_regional_compilation=True,  # We use regional compilation to compile the model way faster
    )

    fp8_config = Float8LinearConfig(
        enable_fsdp_float8_all_gather=True,  # extra saving by gathering parameters in fp8 and upcasting after
    )

    # The FP8 recipe is only handed to the accelerator for `--precision fp8`;
    # for bf16 no extra kwargs handlers are passed.
    kwargs = []
    if args.precision == "fp8":
        kwargs = [AORecipeKwargs(config=fp8_config)]

    accelerator = Accelerator(
        fsdp_plugin=fsdp2_plugin,
        dynamo_plugin=dynamo_plugin,
        kwargs_handlers=kwargs,
        log_with=args.log_with,
    )
    accelerator.init_trackers(
        project_name="FSDP2_torchao_fp8",
        config={"sequence_length": args.sequence_length, "num_steps": args.num_steps},
    )

    # The model is built from its config (`from_config`), not via `from_pretrained`.
    model = AutoModelForCausalLM.from_config(
        AutoConfig.from_pretrained(MODEL_ID, use_cache=False),
        torch_dtype=torch.bfloat16,
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    dataset = get_dataset(tokenizer, args.sequence_length, accelerator)
    dataloader = DataLoader(dataset, batch_size=1, collate_fn=create_collate_fn())

    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
    accelerator.wait_for_everyone()

    model.train()

    # Never run more steps than the prepared dataloader can provide.
    total_num_steps = min(args.num_steps, len(dataloader))
    model_flops_per_token = get_model_flops_per_token(model, args.sequence_length)
    # NOTE(review): the module-level WARMUP_STEPS constant is not used here; warmup is hard-coded to 5.
    performance_tracker = PerformanceTracker(warmup_steps=5)

    for step, batch in enumerate(dataloader):
        if step >= total_num_steps:
            break

        outputs = model(**batch)
        loss = outputs.loss

        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        # Token count of this step is the sequence dimension of `input_ids` (batch size is 1).
        metrics = performance_tracker.step(batch["input_ids"].shape[1], model_flops_per_token)

        print_msg = f"Step {step}/{total_num_steps}, Loss: {loss.item():.4f}"
        if "warmup_completed" in metrics:
            accelerator.print("Warm up completed! Starting training")
        elif metrics:
            print_msg += performance_tracker.get_print_message(metrics)

        # Print every 10th step and on the final step.
        if step % 10 == 0 or step == total_num_steps - 1:
            accelerator.print(print_msg)

        # `metrics` is logged as returned by the tracker on every step.
        accelerator.log(metrics)

    accelerator.wait_for_everyone()
    accelerator.end_training()
    accelerator.print("Training completed!")


if __name__ == "__main__":
    main()


================================================
FILE: examples/torch_native_parallelism/nd_parallel.py
================================================
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Example of training with ND parallel using accelerate's ParallelismConfig
"""

import argparse
import warnings

import torch
import torch.distributed as dist
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM

from accelerate import Accelerator
from accelerate.parallelism_config import ParallelismConfig
from accelerate.utils import FullyShardedDataParallelPlugin, set_seed
from utils import (
    PerformanceTracker,
    create_collate_fn,
    get_dataset,
    get_model_flops_per_token,
    setup_tokenizer,
)


MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"


def parse_args():
    """
    Parse the command-line options of the ND-parallel example.

    Returns:
        argparse.Namespace: parallelism sizes, sequence length, step count, save directory,
        checkpoint frequency and model name.
    """
    cli = argparse.ArgumentParser()

    # All integer options, in the order they appear in `--help`.
    int_options = {
        "--dp-replicate-size": 1,
        "--dp-shard-size": 1,
        "--tp-size": 1,
        "--cp-size": 1,
        "--sequence-length": 1024,
        "--num-steps": 1000,
    }
    for flag, default_value in int_options.items():
        cli.add_argument(flag, type=int, default=default_value)

    cli.add_argument("--save-dir", type=str, default="./outputs")
    cli.add_argument("--checkpoint-frequency", type=int, default=100)
    cli.add_argument("--model-name", type=str, default=MODEL_ID)

    return cli.parse_args()


def forward(model, batch, optimizer, accelerator: Accelerator):
    """
    Run one full training step (forward, backward, optimizer step) and return the averaged loss.

    Args:
        model: The prepared model; called as `model(**batch)` and expected to return an object with `.loss`.
        batch (dict): Must contain `input_ids`, `shift_labels` and `labels`. It is mutated in place:
            a `position_ids` entry is added.
        optimizer: The prepared optimizer.
        accelerator (Accelerator): Provides `maybe_context_parallel`, `backward`, the device mesh and
            the parallelism config.

    Returns:
        The loss after an in-place `all_reduce` with `ReduceOp.AVG`.
    """
    # Positions 0..seq_len-1 on the same device as `input_ids`, with a leading dimension of 1.
    batch["position_ids"] = torch.arange(0, batch["input_ids"].size(1), device=batch["input_ids"].device).unsqueeze(0)
    # We need both labels and shift_labels, as the loss computation in the model is hidden behind `if labels is not None`, but the loss computation
    # itself prioritizes shift_labels (if provided) which are the correct ones (due to labels being wrong if cp enabled)
    buffers = [batch["input_ids"], batch["shift_labels"], batch["labels"], batch["position_ids"]]
    # Every buffer is registered with sequence dimension 1.
    with accelerator.maybe_context_parallel(
        buffers=buffers, buffer_seq_dims=[1, 1, 1, 1], no_restore_buffers=set(buffers)
    ):
        # To get the proper loss value, we need to average across devices that are participating in data parallel/context parallel training
        # As for DP we have a different batch on each device and for CP we essentially have a different part of sequences on each device
        # I.e. with causal modelling and seq_len 1024, this dimension becomes another batch dimension of sorts
        # `group=None` is passed to `all_reduce` when there are no dp/cp dimension names.
        loss_reduce_grp = (
            accelerator.torch_device_mesh["dp_cp"].get_group()
            if accelerator.parallelism_config.dp_cp_dim_names
            else None
        )
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad(set_to_none=False)
        # Reduction happens after the optimizer step; the reduced value is what gets returned.
        dist.all_reduce(loss, op=dist.ReduceOp.AVG, group=loss_reduce_grp)

    return loss


def train(args):
    """
    Train a causal LM with the parallelism layout described by the CLI arguments.

    Args:
        args (argparse.Namespace): Result of `parse_args()`; reads the four `*_size` options,
            `model_name`, `sequence_length`, `num_steps`, `checkpoint_frequency` and `save_dir`.
    """
    parallelism_config = ParallelismConfig(
        dp_replicate_size=args.dp_replicate_size,
        dp_shard_size=args.dp_shard_size,
        tp_size=args.tp_size,
        cp_size=args.cp_size,
    )

    # FSDP needs extra configuration, so we properly shard the model
    # The plugin is only created when sharded data parallelism or context parallelism is enabled.
    fsdp2_plugin = None
    if parallelism_config.dp_shard_enabled or parallelism_config.cp_enabled:
        fsdp2_plugin = FullyShardedDataParallelPlugin(
            fsdp_version=2,
            auto_wrap_policy="transformer_based_wrap",
            transformer_cls_names_to_wrap=["LlamaDecoderLayer"],
            state_dict_type="SHARDED_STATE_DICT",
        )

    accelerator = Accelerator(
        log_with=["wandb"], mixed_precision="bf16", parallelism_config=parallelism_config, fsdp_plugin=fsdp2_plugin
    )
    accelerator.init_trackers("nd_parallel_training")

    # If TP was enabled, we need to tell transformers to prepare the model for us
    model_kwargs = (
        {"tp_size": args.tp_size, "tp_plan": "auto", "device_mesh": accelerator.torch_device_mesh}
        if args.tp_size > 1
        else {}
    )
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        torch_dtype=torch.bfloat16,
        use_cache=False,
        **model_kwargs,
    )
    tokenizer = setup_tokenizer(args.model_name)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-5)
    dataset = get_dataset(tokenizer, args.sequence_length, accelerator)
    dataloader = DataLoader(dataset, batch_size=1, collate_fn=create_collate_fn())

    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

    # Never run more steps than the prepared dataloader can provide.
    total_num_steps = min(args.num_steps, len(dataloader))
    performance_tracker = PerformanceTracker(warmup_steps=5)
    model_flops_per_token = get_model_flops_per_token(model, args.sequence_length)

    accelerator.print("Starting training...")
    for step, batch in enumerate(dataloader):
        if step >= total_num_steps:
            break

        # `forward` performs the whole step (forward, backward, optimizer step) and returns the reduced loss.
        loss = forward(model, batch, optimizer, accelerator)

        # We report TPS per device, so we divide by the number of devices in the non-data parallel dimension
        metrics = performance_tracker.step(
            batch["input_ids"].shape[1] / parallelism_config.non_data_parallel_size, model_flops_per_token
        )

        print_msg = f"Step {step}/{total_num_steps}, Loss: {loss.item():.4f}"
        if "warmup_completed" in metrics:
            accelerator.print("Warm up completed! Starting performance tracking...")
        elif metrics:
            print_msg += performance_tracker.get_print_message(metrics, with_memory=True)

        # Print every 10th step and on the final step.
        if step % 10 == 0 or step == total_num_steps - 1:
            accelerator.print(print_msg)

        # Intermediate checkpoints are only written when dp_shard is enabled, and never at step 0.
        if step % args.checkpoint_frequency == 0 and step > 0 and parallelism_config.dp_shard_enabled:
            accelerator.print(f"Saving checkpoint at step {step}...")
            accelerator.save_state(args.save_dir + f"/checkpoint-{step}")

        accelerator.log({"loss": loss.item()})

    accelerator.print("Training completed!")

    # The final model is saved under `<save_dir>/<model_name>`.
    model.save_pretrained(args.save_dir + f"/{args.model_name}")
    accelerator.print(f"Model saved to {args.save_dir}/{args.model_name}")
    accelerator.end_training()


# Script entry point: seed, parse the CLI, warn about the unsupported checkpointing setup, then train.
if __name__ == "__main__":
    set_seed(42)
    args = parse_args()
    if args.dp_shard_size == 1 and args.tp_size > 1:
        # We currently don't support saving with `save_state` when using only
        # tensor parallelism, fsdp must be enabled
        warnings.warn(
            "Accelerator.save_state() is not yet supported with pure tensor parallel training. Training will work, but intermediate checkpoints will not be saved."
        )
    train(args)


================================================
FILE: examples/torch_native_parallelism/nd_parallel_trainer.py
================================================
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

from accelerate.utils import ParallelismConfig
from utils import get_dataset


MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"


def parse_args():
    """
    Parse the command-line options of the `Trainer`-based ND-parallel example.

    Returns:
        argparse.Namespace: with `sequence_length`, `checkpoint_frequency`, `model_name`,
        `save_dir` and `device_type` attributes.
    """
    option_table = [
        ("--sequence-length", int, 4096),
        ("--checkpoint-frequency", int, 100),
        ("--model-name", str, MODEL_ID),
        ("--save-dir", str, f"./accelerate-nd-parallel-{MODEL_ID.split('/')[-1]}"),
        ("--device-type", str, "auto"),
    ]

    cli = argparse.ArgumentParser()
    for flag, value_type, default_value in option_table:
        cli.add_argument(flag, type=value_type, default=default_value)
    return cli.parse_args()


def main():
    """
    Fine-tune a causal LM with 🤗 transformers `Trainer`, using the parallelism layout taken
    from the accelerate configuration, then save the final model to `--save-dir`.
    """
    # If ParallelismConfig is not initialized with __init__, it reads from env vars
    # which were set by using config
    pc = ParallelismConfig()
    args = parse_args()

    # "auto" resolves to the type of the current torch accelerator.
    if args.device_type == "auto":
        args.device_type = torch.accelerator.current_accelerator().type

    # Only when tensor parallelism is enabled is the model loaded with a TP plan and a device mesh.
    model_kwargs = {}
    if pc.tp_enabled:
        model_kwargs["tp_plan"] = "auto"
        model_kwargs["device_mesh"] = pc.build_device_mesh(args.device_type)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    model = AutoModelForCausalLM.from_pretrained(args.model_name, use_cache=False, **model_kwargs)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # No accelerator is passed here, so `get_dataset` falls back to its default (`accelerator=None`).
    packed_dataset = get_dataset(tokenizer, args.sequence_length)

    training_args = TrainingArguments(
        output_dir=args.save_dir,
        parallelism_config=pc,
        num_train_epochs=1,
        per_device_train_batch_size=1,
        logging_steps=5,
        save_steps=args.checkpoint_frequency,
        learning_rate=5e-5,
        remove_unused_columns=False,
        bf16=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        processing_class=tokenizer,
        train_dataset=packed_dataset,
    )

    trainer.train()
    trainer.save_model()


if __name__ == "__main__":
    main()


================================================
FILE: examples/torch_native_parallelism/utils.py
================================================
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Common utilities for torch-native-parallelism examples.
"""

import time
from contextlib import nullcontext

import torch
from datasets import Dataset, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from accelerate import Accelerator


def get_dataset(tokenizer: AutoTokenizer, seq_len: int, accelerator: Accelerator | None = None) -> Dataset:
    """
    Load the first half of the TinyStories train split, tokenize it and pack it into
    fixed-length sequences.

    Args:
        tokenizer (AutoTokenizer): Hugging Face tokenizer
        seq_len (int): Length of every packed sequence
        accelerator (Accelerator, optional): When given, both `map` calls run inside
            `accelerator.main_process_first()`; otherwise a `nullcontext` is used

    Returns:
        Dataset: Packed dataset with `input_ids`, `shift_labels`, `position_ids` and `labels`
        columns, shuffled with seed 42
    """
    if accelerator:
        processing_ctx = accelerator.main_process_first
    else:
        processing_ctx = nullcontext

    raw_dataset = load_dataset("roneneldan/TinyStories", split="train[:50%]")

    def tokenize_function(examples):
        encoded = tokenizer(
            examples["text"],
            padding=False,
            truncation=True,
            max_length=seq_len,
            return_tensors=None,
        )
        encoded["labels"] = encoded["input_ids"].copy()
        return encoded

    with processing_ctx():
        tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    def create_packed_sequences(examples):
        # Each packed sample consumes seq_len + 1 tokens: inputs drop the last token,
        # labels drop the first one.
        window = seq_len + 1
        token_stream = [token for input_ids in examples["input_ids"] for token in input_ids]
        usable_tokens = (len(token_stream) // window) * window

        packed_input_ids, packed_labels, packed_position_ids = [], [], []
        for start_idx in range(0, usable_tokens, window):
            full_sequence = token_stream[start_idx : start_idx + window]
            packed_input_ids.append(full_sequence[:-1])
            packed_labels.append(full_sequence[1:])
            packed_position_ids.append(torch.arange(0, seq_len))

        return {
            "input_ids": packed_input_ids,
            "shift_labels": packed_labels,
            "position_ids": packed_position_ids,
            "labels": packed_labels,
        }

    with processing_ctx():
        packed_dataset = tokenized_dataset.map(
            create_packed_sequences,
            batched=True,
            remove_columns=tokenized_dataset.column_names,
            batch_size=1000,
        )

    return packed_dataset.shuffle(seed=42)


def get_model_flops_per_token(model: AutoModelForCausalLM, seq_len: int) -> float:
    """
    Estimate the number of flops per token for the model.

    The per-layer estimate is the sum of an MLP term, an attention term without the dot product,
    and an attention dot-product term proportional to `seq_len`; it is then multiplied by the
    number of hidden layers. Embeddings, layernorms, etc. are ignored.

    Args:
        model (AutoModelForCausalLM): Model whose `config` provides `hidden_size`,
            `num_attention_heads`, `num_key_value_heads`, `intermediate_size` and `num_hidden_layers`
        seq_len (int): Sequence length
    """
    config = model.config
    hidden = config.hidden_size
    n_heads = config.num_attention_heads
    per_head_dim = hidden // n_heads

    per_layer_terms = (
        # MLP: 3 matmuls
        18 * hidden * config.intermediate_size,
        # Attn (w/o dotproduct)
        12 * per_head_dim * (n_heads + config.num_key_value_heads),
        # attn (dotproduct) - this term grows with the sequence length
        12 * n_heads * per_head_dim * seq_len,
    )

    return sum(per_layer_terms) * config.num_hidden_layers


def create_collate_fn():
    """Build the collate function that turns a list of packed examples into one batch of tensors."""

    def collate_fn(batch):
        def _as_long_tensor(column):
            # Gather one column across the examples and materialise it as an int64 tensor.
            return torch.tensor([example[column] for example in batch], dtype=torch.long)

        token_ids = _as_long_tensor("input_ids")
        targets = _as_long_tensor("shift_labels")
        # The very same tensor object is exposed under both label keys.
        return {"input_ids": token_ids, "shift_labels": targets, "labels": targets}

    return collate_fn


class PerformanceTracker:
    """Track training performance metrics."""

    def __init__(self, warmup_steps: int = 10):
        self.warmup_steps = warmup_steps
        self.reset()

    def reset(self):
        """Reset all tracking variables."""
        self.start_time = None
        self.num_tokens = 0
        self.is_in_warmup = True
        self.step_count = 0

    def step(self, batch_tokens: int, model_flops_per_token: float | None = None) -> dict:
        """
        Update performance tracking with a new step.

        Args:
            batch_tokens (int): Number of tokens in current batch

        Returns:
            dict: Performance metrics if past warmup, empty dict otherwise
        """
        self.step_count += 1

        if self.step_count == self.warmup_steps:
            self.start_time = time.perf_counter()
            self.num_tokens = 0
            self.is_in_warmup = False
            return {"warmup_completed": True}

        if not self.is_in_warmup and self.start_time is not None:
            dct = {}
            self.num_tokens += batch_tokens
            total_time = time.perf_counter() - self.start_time
            steps_from_warmup = self.step_count - self.warmup_steps

            if total_time > 0 and steps_from_warmup > 0:
                memory_stats = gpu_memory_usage_all()
                dct = {
                    "tokens_per_second": self.num_tokens / total_time,
                    "steps_per_second": steps_from_warmup / total_time,
                    "total_tokens": self.num_tokens,
                    "total_time": total_time,
                    **memory_stats,
                }

            if model_flops_per_token is not None:
                flops = model_flops_per_token * self.num_tokens
                dct["tflops_per_device"] = flops / (total_time * 1e12)

            return dct

        return {}

    def get_print_message(self, metrics: dict, with_memory: bool = False) -> str:
        print_msg = f" | Average steps/s: {metrics['steps_per_second']:.2f} | Average tokens/s: {metrics['tokens_per_second']:.2f} | Average TFLOPS: {metrics['tflops_per_device']:.2f}\n"
        if with_memory:
            print_msg += (
                f"\tMemory (GB): active={metrics['peak_memory_active']:.1f}, "
                f"alloc={metrics['peak_memory_alloc']:.1f}, "
                f"reserved={metrics['peak_memory_reserved']:.1f}"
            )
        return print_msg


def setup_tokenizer(model_id: str) -> AutoTokenizer:
    """Load the tokenizer for `model_id`, reusing the EOS token for padding when no pad token is set."""
    tok = AutoTokenizer.from_pretrained(model_id)
    if tok.pad_token is not None:
        return tok
    # No dedicated pad token: fall back to the end-of-sequence token.
    tok.pad_token = tok.eos_token
    return tok


def gpu_memory_usage_all(device=0):
    device_type = torch.accelerator.current_accelerator().type
    device = torch.device(f"{device_type}:{device}")
    torch_device_module = getattr(torch, device_type, torch.cuda)
    _BYTES_IN_GIB = 1024**3
    peak_memory_active = torch_device_module.memory_stats().get("active_bytes.all.peak", 0) / _BYTES_IN_GIB
    peak_memory_alloc = torch_device_module.max_memory_allocated(device) / _BYTES_IN_GIB
    peak_memory_reserved = torch_device_module.max_memory_reserved(device) / _BYTES_IN_GIB
    memory_stats = {
        "peak_memory_active": peak_memory_active,
        "peak_memory_alloc": peak_memory_alloc,
        "peak_memory_reserved": peak_memory_reserved,
    }
    torch_device_module.reset_peak_memory_stats(device)

    return memory_stats


================================================
FILE: manim_animations/big_model_inference/stage_1.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from manim import *


class Stage1(Scene):
    """Big model inference, stage 1: draw the CPU and GPU memory and move an empty model skeleton next to the CPU."""

    def construct(self):
        # Template memory cell; every grid below is made of copies of it.
        mem = Rectangle(height=0.5, width=0.5)

        # CPU memory: two columns of six cells with a caption.
        cpu_left_col_base = [mem.copy() for _ in range(6)]
        cpu_right_col_base = [mem.copy() for _ in range(6)]
        cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
        cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
        cpu_rects = VGroup(cpu_left_col, cpu_right_col).arrange(RIGHT, buff=0)
        cpu_text = Text("CPU", font_size=24)
        cpu = Group(cpu_rects, cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        cpu.move_to([-2.5, -0.5, 0])
        self.add(cpu)

        # GPU memory: a single cell, bottom-aligned with the CPU block.
        gpu_base = [mem.copy() for _ in range(1)]
        gpu_rect = VGroup(*gpu_base).arrange(UP, buff=0)
        gpu_text = Text("GPU", font_size=24)
        gpu = Group(gpu_rect, gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        gpu.align_to(cpu, DOWN)
        gpu.set_x(gpu.get_x() - 1)

        self.add(gpu)

        # Model: a row of six cells, added to the scene only after the first caption is written.
        model_base = [mem.copy() for _ in range(6)]
        model_rect = VGroup(*model_base).arrange(RIGHT, buff=0)

        model_text = Text("Model", font_size=24)
        model = Group(model_rect, model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        model.move_to([3, -1.0, 0])

        self.play(
            Create(cpu_left_col, run_time=1),
            Create(cpu_right_col, run_time=1),
            Create(gpu_rect, run_time=1),
        )

        step_1 = MarkupText(
            f"First, an empty model skeleton is loaded\ninto <span fgcolor='{YELLOW}'>memory</span> without using much RAM.",
            font_size=24,
        )

        # Legend box in the top-left corner.
        key = Square(side_length=2.2)
        key.move_to([-5, 2, 0])

        key_text = MarkupText(
            f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
            font_size=18,
        )

        key_text.move_to([-5, 2.4, 0])

        step_1.move_to([2, 2, 0])
        self.play(
            Write(step_1, run_time=2.5),
            Write(key_text),
            Write(key),
        )

        self.add(model)

        cpu_targs = []
        first_animations = []
        second_animations = []
        for i, rect in enumerate(model_base):
            # A yellow patch starts on top of the model cell and shrinks while moving to the CPU block.
            cpu_target = Rectangle(height=0.46, width=0.46).set_stroke(width=0.0).set_fill(YELLOW, opacity=0.7)
            cpu_target.move_to(rect)
            cpu_target.generate_target()
            cpu_target.target.height = 0.46 / 4
            cpu_target.target.width = 0.46 / 3

            # The six shrunken patches are laid out in two rows of three: patch 0 is anchored at the
            # bottom-left corner of the first CPU cell, patch 3 starts the second row above patch 0, and
            # every other patch goes to the right of its predecessor.
            if i == 0:
                cpu_target.target.next_to(cpu_left_col_base[0].get_corner(DOWN + LEFT), buff=0.02, direction=UP)
                cpu_target.target.set_x(cpu_target.target.get_x() + 0.1)
            elif i == 3:
                cpu_target.target.next_to(cpu_targs[0].target, direction=UP, buff=0.0)
            else:
                cpu_target.target.next_to(cpu_targs[i - 1].target, direction=RIGHT, buff=0.0)
            cpu_targs.append(cpu_target)

            first_animations.append(rect.animate(run_time=0.5).set_stroke(YELLOW))
            second_animations.append(MoveToTarget(cpu_target, run_time=1.5))

        # First outline the model cells in yellow, then move the patches.
        self.play(*first_animations)
        self.play(*second_animations)

        self.wait()

================================================
FILE: manim_animations/big_model_inference/stage_2.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from manim import *

class Stage2(Scene):
    """Big model inference, stage 2: a checkpoint shard is drawn and moved into the CPU memory cells."""

    def construct(self):
        cell = Rectangle(height=0.5, width=0.5)
        cell_fill = Rectangle(height=0.46, width=0.46).set_stroke(width=0)

        # CPU memory: two columns of six cells with a caption.
        cpu_left_cells = [cell.copy() for _ in range(6)]
        cpu_right_cells = [cell.copy() for _ in range(6)]
        cpu_grid = VGroup(
            VGroup(*cpu_left_cells).arrange(UP, buff=0),
            VGroup(*cpu_right_cells).arrange(UP, buff=0),
        ).arrange(RIGHT, buff=0)
        cpu_block = Group(cpu_grid, Text("CPU", font_size=24)).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        cpu_block.move_to([-2.5, -0.5, 0])
        self.add(cpu_block)

        # GPU memory: one column of four cells.
        gpu_column = VGroup(*[cell.copy() for _ in range(4)]).arrange(UP, buff=0)
        gpu_block = Group(gpu_column, Text("GPU", font_size=24)).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        gpu_block.move_to([-1, -1, 0])
        self.add(gpu_block)

        # Model: one row of six cells.
        model_cells = [cell.copy() for _ in range(6)]
        model_row = VGroup(*model_cells).arrange(RIGHT, buff=0)
        model_block = Group(model_row, Text("Model", font_size=24)).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        model_block.move_to([3, -1.0, 0])
        self.add(model_block)

        # Outline the model in yellow and draw its six small patches (two rows of three) by the first CPU cell.
        skeleton_marks = []
        for idx, model_cell in enumerate(model_cells):
            model_cell.set_stroke(YELLOW)

            mark = Rectangle(height=0.46 / 4, width=0.46 / 3).set_stroke(width=0.0).set_fill(YELLOW, opacity=0.7)
            if idx == 0:
                mark.next_to(cpu_left_cells[0].get_corner(DOWN + LEFT), buff=0.02, direction=UP)
                mark.set_x(mark.get_x() + 0.1)
            elif idx == 3:
                mark.next_to(skeleton_marks[0], direction=UP, buff=0.0)
            else:
                mark.next_to(skeleton_marks[idx - 1], direction=RIGHT, buff=0.0)
            self.add(mark)
            skeleton_marks.append(mark)

        # Checkpoint row; positioned now, drawn later by an animation.
        ckpt_cells = [cell.copy() for _ in range(6)]
        ckpt_row = VGroup(*ckpt_cells).arrange(RIGHT, buff=0)
        ckpt_caption = Text("Loaded Checkpoint", font_size=24)
        ckpt_block = Group(ckpt_row, ckpt_caption).arrange(DOWN, aligned_edge=DOWN, buff=0.4)
        ckpt_block.move_to([3, 0.5, 0])

        # Legend.
        legend_box = Square(side_length=2.2)
        legend_box.move_to([-5, 2, 0])
        legend_text = MarkupText(
            f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
            font_size=18,
        )
        legend_text.move_to([-5, 2.4, 0])
        self.add(legend_text, legend_box)

        legend_ckpt = MarkupText(
            f"<span fgcolor='{BLUE}'>●</span> Checkpoint",
            font_size=18,
        )
        legend_ckpt.next_to(legend_text, DOWN * 2.4, aligned_edge=legend_text.get_left())

        caption = MarkupText(
            f'Next, a <i><span fgcolor="{BLUE}">second</span></i> model is loaded into memory,\nwith the weights of a <span fgcolor="{BLUE}">single shard</span>.',
            font_size=24,
        )
        caption.move_to([2, 2, 0])
        self.play(Write(caption), Write(legend_ckpt))

        self.play(Write(ckpt_caption, run_time=1), Create(ckpt_row, run_time=1))

        grow_anims = []
        move_anims = []
        for idx, ckpt_cell in enumerate(ckpt_cells):
            # Blue fill that grows inside the checkpoint cell ...
            shard = cell_fill.copy().set_fill(BLUE, opacity=0.7)
            shard.move_to(ckpt_cell)
            grow_anims.append(GrowFromCenter(shard, run_time=1))

            # ... and a copy of it that travels to a CPU cell: left-column cells 1-5 first, then the
            # first cell of the right column.
            travelling = shard.copy()
            travelling.generate_target()
            destination = cpu_left_cells[idx + 1] if idx < 5 else cpu_right_cells[idx - 5]
            travelling.target.move_to(destination)
            move_anims.append(MoveToTarget(travelling, run_time=1.5))

        self.play(*grow_anims)
        self.play(*move_anims)
        self.wait()

================================================
FILE: manim_animations/big_model_inference/stage_3.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from manim import *

class Stage3(Scene):
    """Big model inference, stage 3: checkpoint weights are copied to the disk grid, then the checkpoint fades out."""

    def construct(self):
        mem = Rectangle(height=0.5, width=0.5)
        meta_mem = Rectangle(height=0.25, width=0.25)
        fill = Rectangle(height=0.46, width=0.46).set_stroke(width=0)

        # CPU memory: two columns of six cells with a caption.
        cpu_left_col_base = [mem.copy() for _ in range(6)]
        cpu_right_col_base = [mem.copy() for _ in range(6)]
        cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
        cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
        cpu_rects = VGroup(cpu_left_col, cpu_right_col).arrange(RIGHT, buff=0)
        cpu_text = Text("CPU", font_size=24)
        cpu = Group(cpu_rects, cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        cpu.move_to([-2.5, -0.5, 0])
        self.add(cpu)

        # GPU memory: one column of four cells.
        gpu_base = [mem.copy() for _ in range(4)]
        gpu_rect = VGroup(*gpu_base).arrange(UP, buff=0)
        gpu_text = Text("GPU", font_size=24)
        gpu = Group(gpu_rect, gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        gpu.move_to([-1, -1, 0])
        self.add(gpu)

        # Model: one row of six cells.
        model_base = [mem.copy() for _ in range(6)]
        model_rect = VGroup(*model_base).arrange(RIGHT, buff=0)

        model_text = Text("Model", font_size=24)
        model = Group(model_rect, model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        model.move_to([3, -1.0, 0])
        self.add(model)

        # Yellow outline on the model plus its six small patches (two rows of three) by the first CPU cell.
        model_cpu_arr = []
        for i, rect in enumerate(model_base):
            rect.set_stroke(YELLOW)

            cpu_target = Rectangle(height=0.46 / 4, width=0.46 / 3).set_stroke(width=0.0).set_fill(YELLOW, opacity=0.7)

            if i == 0:
                cpu_target.next_to(cpu_left_col_base[0].get_corner(DOWN + LEFT), buff=0.02, direction=UP)
                cpu_target.set_x(cpu_target.get_x() + 0.1)
            elif i == 3:
                cpu_target.next_to(model_cpu_arr[0], direction=UP, buff=0.0)
            else:
                cpu_target.next_to(model_cpu_arr[i - 1], direction=RIGHT, buff=0.0)
            self.add(cpu_target)
            model_cpu_arr.append(cpu_target)

        # Checkpoint row, already visible at the start of this stage.
        checkpoint_base = [mem.copy() for _ in range(6)]
        checkpoint_rect = VGroup(*checkpoint_base).arrange(RIGHT, buff=0)

        checkpoint_text = Text("Loaded Checkpoint", font_size=24)
        checkpoint = Group(checkpoint_rect, checkpoint_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        checkpoint.move_to([3, 0.5, 0])

        self.add(checkpoint)

        # Blue fills: one on each checkpoint cell and one on the CPU cell that holds its copy
        # (left-column cells 1-5, then the first cell of the right column).
        ckpt_arr = []
        ckpt_cpu_arr = []

        for i, rect in enumerate(checkpoint_base):
            target = fill.copy().set_fill(BLUE, opacity=0.7)
            target.move_to(rect)
            ckpt_arr.append(target)

            cpu_target = target.copy()
            if i < 5:
                cpu_target.move_to(cpu_left_col_base[i + 1])
            else:
                cpu_target.move_to(cpu_right_col_base[i - 5])
            ckpt_cpu_arr.append(cpu_target)
        self.add(*ckpt_arr, *ckpt_cpu_arr)

        # Legend.
        key = Square(side_length=2.2)
        key.move_to([-5, 2, 0])

        key_text = MarkupText(
            f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
            font_size=18,
        )

        key_text.move_to([-5, 2.4, 0])

        self.add(key_text, key)

        blue_text = MarkupText(
            f"<span fgcolor='{BLUE}'>●</span> Checkpoint",
            font_size=18,
        )

        blue_text.next_to(key_text, DOWN * 2.4, aligned_edge=key_text.get_left())
        self.add(blue_text)

        step_3 = MarkupText(
            "Based on the passed in configuration, weights are stored in\na variety of np.memmaps on disk or to a particular device.",
            font_size=24,
        )
        step_3.move_to([2, 2, 0])

        # Disk: two columns of six half-size cells, drawn together with the caption.
        disk_left_col_base = [meta_mem.copy() for _ in range(6)]
        disk_right_col_base = [meta_mem.copy() for _ in range(6)]
        disk_left_col = VGroup(*disk_left_col_base).arrange(UP, buff=0)
        disk_right_col = VGroup(*disk_right_col_base).arrange(UP, buff=0)
        disk_rects = VGroup(disk_left_col, disk_right_col).arrange(RIGHT, buff=0)
        disk_text = Text("Disk", font_size=24)
        disk = Group(disk_rects, disk_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        disk.move_to([-4.0, -1.25, 0])
        self.play(
            Write(step_3, run_time=3),
            Write(disk_text, run_time=1),
            Create(disk_rects, run_time=1),
        )

        # A copy of every CPU-side checkpoint fill shrinks to half size while moving into the left disk column.
        animations = []
        for i, rect in enumerate(ckpt_cpu_arr):
            target = rect.copy()
            target.generate_target()
            target.target.move_to(disk_left_col_base[i]).scale(0.5)
            animations.append(MoveToTarget(target, run_time=1.5))
        self.play(*animations)

        self.play(FadeOut(step_3))

        step_4 = MarkupText(
            "Then, the checkpoint is removed from memory\nthrough garbage collection.",
            font_size=24,
        )
        step_4.move_to([2, 2, 0])

        self.play(Write(step_4, run_time=3))

        self.play(
            FadeOut(checkpoint_rect, checkpoint_text, *ckpt_arr, *ckpt_cpu_arr),
        )

        self.wait()

================================================
FILE: manim_animations/big_model_inference/stage_4.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from manim import *

class Stage4(Scene):
    """Big model inference, stage 4: offloaded weights move from the disk grid to the CPU, then hook arrows appear."""

    def construct(self):
        mem = Rectangle(height=0.5, width=0.5)
        fill = Rectangle(height=0.46, width=0.46).set_stroke(width=0)
        meta_mem = Rectangle(height=0.25, width=0.25)

        # CPU memory: two columns of six cells with a caption.
        cpu_left_col_base = [mem.copy() for _ in range(6)]
        cpu_right_col_base = [mem.copy() for _ in range(6)]
        cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
        cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
        cpu_rects = VGroup(cpu_left_col, cpu_right_col).arrange(RIGHT, buff=0)
        cpu_text = Text("CPU", font_size=24)
        cpu = Group(cpu_rects, cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        cpu.move_to([-2.5, -0.5, 0])
        self.add(cpu)

        # GPU memory: one column of four cells.
        gpu_base = [mem.copy() for _ in range(4)]
        gpu_rect = VGroup(*gpu_base).arrange(UP, buff=0)
        gpu_text = Text("GPU", font_size=24)
        gpu = Group(gpu_rect, gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        gpu.move_to([-1, -1, 0])
        self.add(gpu)

        # Model: one row of six cells.
        model_base = [mem.copy() for _ in range(6)]
        model_rect = VGroup(*model_base).arrange(RIGHT, buff=0)

        model_text = Text("Model", font_size=24)
        model = Group(model_rect, model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        model.move_to([3, -1.0, 0])
        self.add(model)

        # Yellow outline on the model plus its six small patches (two rows of three) by the first CPU cell.
        model_cpu_arr = []
        for i, rect in enumerate(model_base):
            rect.set_stroke(YELLOW)

            cpu_target = Rectangle(height=0.46 / 4, width=0.46 / 3).set_stroke(width=0.0).set_fill(YELLOW, opacity=0.7)

            if i == 0:
                cpu_target.next_to(cpu_left_col_base[0].get_corner(DOWN + LEFT), buff=0.02, direction=UP)
                cpu_target.set_x(cpu_target.get_x() + 0.1)
            elif i == 3:
                cpu_target.next_to(model_cpu_arr[0], direction=UP, buff=0.0)
            else:
                cpu_target.next_to(model_cpu_arr[i - 1], direction=RIGHT, buff=0.0)
            self.add(cpu_target)
            model_cpu_arr.append(cpu_target)

        # Disk: two columns of six half-size cells with a caption.
        disk_left_col_base = [meta_mem.copy() for _ in range(6)]
        disk_right_col_base = [meta_mem.copy() for _ in range(6)]
        disk_left_col = VGroup(*disk_left_col_base).arrange(UP, buff=0)
        disk_right_col = VGroup(*disk_right_col_base).arrange(UP, buff=0)
        disk_rects = VGroup(disk_left_col, disk_right_col).arrange(RIGHT, buff=0)
        disk_text = Text("Disk", font_size=24)
        disk = Group(disk_rects, disk_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        disk.move_to([-4.0, -1.25, 0])
        self.add(disk_text, disk_rects)

        # Half-size blue fills sitting in the left disk column.
        cpu_disk_arr = []
        for i in range(6):
            target = fill.copy().set_fill(BLUE, opacity=0.8)
            target.move_to(disk_left_col_base[i]).scale(0.5)
            cpu_disk_arr.append(target)

        self.add(*cpu_disk_arr)

        # Legend.
        key = Square(side_length=2.2)
        key.move_to([-5, 2, 0])

        key_text = MarkupText(
            f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
            font_size=18,
        )

        key_text.move_to([-5, 2.4, 0])

        self.add(key_text, key)

        blue_text = MarkupText(
            f"<span fgcolor='{BLUE}'>●</span> Checkpoint",
            font_size=18,
        )

        blue_text.next_to(key_text, DOWN * 2.4, aligned_edge=key_text.get_left())
        self.add(blue_text)

        step_5 = MarkupText(
            "The offloaded weights are all sent to the CPU.",
            font_size=24,
        )
        step_5.move_to([2, 2, 0])

        self.play(Write(step_5, run_time=3))

        for i in range(6):
            rect = cpu_disk_arr[i]
            # A full-size copy of the disk fill travels to the matching model cell ...
            cp2 = rect.copy().set_fill(BLUE, opacity=0.8).scale(2.0)
            cp2.generate_target()
            cp2.target.move_to(model_base[i])

            if i == 0:
                rect.set_fill(BLUE, opacity=0.8)
                # The yellow skeleton patches disappear as soon as the first weight moves.
                self.remove(*model_cpu_arr)

            # ... while the disk fill itself grows back to full size on its way to the left CPU column.
            rect.generate_target()
            rect.target.move_to(cpu_left_col_base[i]).scale(2.0)

            self.play(
                MoveToTarget(rect),
                MoveToTarget(cp2),
                model_base[i].animate.set_stroke(WHITE),
            )
        self.play(FadeOut(step_5))

        step_6 = MarkupText(
            "Finally, hooks are added to each weight in the model\nto transfer the weights from CPU to GPU\n\t\tand back when needed.",
            font_size=24,
        )
        step_6.move_to([2, 2, 0])

        self.play(Write(step_6, run_time=3))

        # One red downward arrow per model cell, all written at once.
        animations = []
        for i in range(6):
            a = Arrow(start=UP, end=DOWN, color=RED, buff=0.5)
            a.next_to(model_base[i].get_left(), UP, buff=0.2)
            animations.append(Write(a))
        self.play(*animations)
        self.wait()

================================================
FILE: manim_animations/big_model_inference/stage_5.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from manim import *

class Stage5(Scene):
    """Big model inference, stage 5: an input moves through the model while weights hop between CPU and GPU."""

    def construct(self):
        mem = Rectangle(height=0.5, width=0.5)
        fill = Rectangle(height=0.46, width=0.46).set_stroke(width=0)

        meta_mem = Rectangle(height=0.25, width=0.25)

        # CPU memory: two columns of six cells with a caption.
        cpu_left_col_base = [mem.copy() for _ in range(6)]
        cpu_right_col_base = [mem.copy() for _ in range(6)]
        cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
        cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
        cpu_rects = VGroup(cpu_left_col, cpu_right_col).arrange(RIGHT, buff=0)
        cpu_text = Text("CPU", font_size=24)
        cpu = Group(cpu_rects, cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        cpu.move_to([-2.5, -0.5, 0])
        self.add(cpu)

        # GPU memory: one column of four cells.
        gpu_base = [mem.copy() for _ in range(4)]
        gpu_rect = VGroup(*gpu_base).arrange(UP, buff=0)
        gpu_text = Text("GPU", font_size=24)
        gpu = Group(gpu_rect, gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        gpu.move_to([-1, -1, 0])
        self.add(gpu)

        # Model: one row of six cells.
        model_base = [mem.copy() for _ in range(6)]
        model_rect = VGroup(*model_base).arrange(RIGHT, buff=0)

        model_text = Text("Model", font_size=24)
        model = Group(model_rect, model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        model.move_to([3, -1.0, 0])
        self.add(model)

        # Blue fills: one on every model cell and one on the left-column CPU cell with the same index.
        model_arr = []
        model_cpu_arr = []

        for i, rect in enumerate(model_base):
            target = fill.copy().set_fill(BLUE, opacity=0.8)
            target.move_to(rect)
            model_arr.append(target)

            cpu_target = Rectangle(height=0.46, width=0.46).set_stroke(width=0.0).set_fill(BLUE, opacity=0.8)
            cpu_target.move_to(cpu_left_col_base[i])
            model_cpu_arr.append(cpu_target)

        self.add(*model_arr, *model_cpu_arr)

        # Disk: two columns of six half-size cells with a caption.
        disk_left_col_base = [meta_mem.copy() for _ in range(6)]
        disk_right_col_base = [meta_mem.copy() for _ in range(6)]
        disk_left_col = VGroup(*disk_left_col_base).arrange(UP, buff=0)
        disk_right_col = VGroup(*disk_right_col_base).arrange(UP, buff=0)
        disk_rects = VGroup(disk_left_col, disk_right_col).arrange(RIGHT, buff=0)
        disk_text = Text("Disk", font_size=24)
        disk = Group(disk_rects, disk_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        disk.move_to([-4, -1.25, 0])
        self.add(disk_text, disk_rects)

        # Legend.
        key = Square(side_length=2.2)
        key.move_to([-5, 2, 0])

        key_text = MarkupText(
            f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
            font_size=18,
        )

        key_text.move_to([-5, 2.4, 0])

        self.add(key_text, key)

        blue_text = MarkupText(
            f"<span fgcolor='{BLUE}'>●</span> Checkpoint",
            font_size=18,
        )

        blue_text.next_to(key_text, DOWN * 2.4, aligned_edge=key_text.get_left())
        self.add(blue_text)

        step_6 = MarkupText(
            "Now watch as an input is passed through the model\nand how the memory is utilized and handled.",
            font_size=24,
        )
        step_6.move_to([2, 2, 0])

        self.play(Write(step_6))

        # The input is a small red square that starts to the left of the model.
        input_square = Square(0.3)
        input_square.set_fill(RED, opacity=1.0)
        input_square.set_stroke(width=0.0)
        input_square.next_to(model_base[0], LEFT, buff=0.5)

        self.play(Write(input_square))

        input_square.generate_target()
        input_square.target.next_to(model_arr[0], direction=LEFT, buff=0.02)
        self.play(MoveToTarget(input_square))

        self.play(FadeOut(step_6))

        # Red arrow marking the hook of the layer the input is about to enter.
        a = Arrow(start=UP, end=DOWN, color=RED, buff=0.5)
        a.next_to(model_arr[0].get_left(), UP, buff=0.2)

        # The first layer's weights move from their CPU cell to the first GPU cell.
        model_cpu_arr[0].generate_target()
        model_cpu_arr[0].target.move_to(gpu_rect[0])

        step_7 = MarkupText(
            "As the input reaches a layer, the hook triggers\nand weights are moved from the CPU\nto the GPU and back.",
            font_size=24,
        )
        step_7.move_to([2, 2, 0])

        self.play(Write(step_7, run_time=3))

        circ_kwargs = {"run_time": 1, "fade_in": True, "fade_out": True, "buff": 0.02}

        self.play(
            Write(a),
            Circumscribe(model_arr[0], color=ORANGE, **circ_kwargs),
            Circumscribe(model_cpu_arr[0], color=ORANGE, **circ_kwargs),
            Circumscribe(gpu_rect[0], color=ORANGE, **circ_kwargs),
        )
        self.play(MoveToTarget(model_cpu_arr[0]))

        a_c = a.copy()
        for i in range(6):
            # NOTE(review): the scalar 0.02 is added to the whole point returned by `get_right()`
            # (presumably a numpy array, so every coordinate shifts) - confirm this is intended.
            a_c.next_to(model_arr[i].get_right() + 0.02, UP, buff=0.2)

            input_square.generate_target()
            input_square.target.move_to(model_arr[i].get_right() + 0.02)

            # The previous arrow fades out while the input advances and the next arrow fades in.
            grp = AnimationGroup(
                FadeOut(a, run_time=0.5),
                MoveToTarget(input_square, run_time=0.5),
                FadeIn(a_c, run_time=0.5),
                lag_ratio=0.2,
            )

            self.play(grp)

            # Layer i is done: its weights go back from the GPU cell to CPU cell i.
            model_cpu_arr[i].generate_target()
            model_cpu_arr[i].target.move_to(cpu_left_col_base[i])

            if i < 5:
                # The next layer's weights take their place on the GPU.
                model_cpu_arr[i + 1].generate_target()
                model_cpu_arr[i + 1].target.move_to(gpu_rect[0])
                if i >= 1:
                    # Speed up the highlights after the first swap.
                    circ_kwargs["run_time"] = 0.7

                self.play(
                    Circumscribe(model_arr[i], **circ_kwargs),
                    Circumscribe(cpu_left_col_base[i], **circ_kwargs),
                    Circumscribe(cpu_left_col_base[i + 1], color=ORANGE, **circ_kwargs),
                    Circumscribe(gpu_rect[0], color=ORANGE, **circ_kwargs),
                    Circumscribe(model_arr[i + 1], color=ORANGE, **circ_kwargs),
                )
                if i < 1:
                    self.play(
                        MoveToTarget(model_cpu_arr[i]),
                        MoveToTarget(model_cpu_arr[i + 1]),
                    )
                else:
                    self.play(
                        MoveToTarget(model_cpu_arr[i], run_time=0.7),
                        MoveToTarget(model_cpu_arr[i + 1], run_time=0.7),
                    )
            else:
                # Last layer: nothing to load next, only send the weights back to the CPU. The target
                # generated above already points at the last CPU cell of the left column.
                self.play(
                    Circumscribe(model_arr[-1], color=ORANGE, **circ_kwargs),
                    Circumscribe(cpu_left_col_base[-1], color=ORANGE, **circ_kwargs),
                    Circumscribe(gpu_rect[0], color=ORANGE, **circ_kwargs),
                )

                self.play(MoveToTarget(model_cpu_arr[i]))

            a = a_c
            a_c = a_c.copy()

        # Final position of the input, to the right of the model; animated together with the last caption.
        input_square.generate_target()
        input_square.target.next_to(model_base[-1], RIGHT + 0.02, buff=0.5)
        self.play(
            FadeOut(step_7),
            FadeOut(a, run_time=0.5),
        )

        step_8 = MarkupText(
            "Inference on a model too large for GPU memory\nis successfully completed.", font_size=24
        )
        step_8.move_to([2, 2, 0])

        self.play(
            Write(step_8, run_time=3),
            MoveToTarget(input_square),
        )

        self.wait()

================================================
FILE: manim_animations/dataloaders/stage_0.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from manim import *


class Stage0(Scene):
    """Title card: the mascot image on the left and a bold headline on the right."""

    def construct(self):
        """Build both mobjects and add them to the scene; no animation is played."""
        headline = (
            "Distributed Training,\nHugging Face Accelerate,\nand PyTorch DataLoaders\n\nHow do they all interact?"
        )

        # Mobject transforms return the mobject itself, so they can be chained.
        bookie = ImageMobject("mascot_bookie.png").scale(0.35).move_to([-3.75, -1, 0])
        title = Paragraph(
            headline,
            font_size=36,
            line_spacing=1,
            alignment="center",
            weight=BOLD,
        ).move_to([1.75, 0.5, 0])

        # Mascot first, then the text, matching the draw order of separate add() calls.
        self.add(bookie, title)

================================================
FILE: manim_animations/dataloaders/stage_1.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from manim import *

class Stage01(Scene):
    """Intro frame showing the mascot next to the series' headline question."""

    def construct(self):
        """Create the image and the paragraph, position them, and add them without animating."""
        paragraph_options = {
            "font_size": 36,
            "line_spacing": 1,
            "alignment": "center",
            "weight": BOLD,
        }
        headline = Paragraph(
            "Distributed Training,\nHugging Face Accelerate,\nand PyTorch DataLoaders\n\nHow do they all interact?",
            **paragraph_options,
        )
        headline.move_to([1.75, 0.5, 0])

        picture = ImageMobject("mascot_bookie.png")
        picture.scale(0.35)
        picture.move_to([-3.75, -1, 0])

        # The image is added before the text, as in a pair of consecutive add() calls.
        self.add(picture, headline)

================================================
FILE: manim_animations/dataloaders/stage_2.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from manim import *


class Stage2(Scene):
    """Show the un-sharded case: every GPU ends up with the same samples.

    A dataset grid, a ``DataLoader`` box containing a ``Sampler`` box, and four
    GPU boxes are drawn. The last row of the dataset grid is highlighted, moved
    onto the sampler, and then duplicated onto each of the four GPU boxes.
    """

    @staticmethod
    def _stack_in_pairs(items, anchor):
        """Give each item a ``target`` laid out two per row relative to ``anchor``.

        Even-indexed items start a new row: they are placed with ``next_to`` in
        the ``UP`` direction, left-aligned with ``anchor``, using a buffer that
        decreases by 0.25 per row (-0.25, -0.5, ...). Odd-indexed items are put
        immediately to the right of the item that started their row.

        Args:
            items: Mobjects to lay out; ``generate_target()`` is called on each.
            anchor: Mobject the rows are positioned against.
        """
        row_buff = 0
        row_start = None
        for index, item in enumerate(items):
            item.generate_target()
            if index % 2 == 0:
                row_buff -= .25
                item.target.next_to(anchor, buff=row_buff, direction=UP, aligned_edge=LEFT)
                row_start = item.target
            else:
                item.target.next_to(row_start, direction=RIGHT, buff=0.01)

    def construct(self):
        """Build the static layout, then play the highlight / move / duplicate sequence."""
        # The dataset items: 4 rows of 8 green cells, stacked upwards.
        rows = [
            VGroup(*[Rectangle(height=0.25, width=0.25, color="green") for _ in range(8)]).arrange(RIGHT, buff=0)
            for _ in range(4)
        ]
        dataset_recs = VGroup(*rows).arrange(UP, buff=0)
        dataset_text = Text("Dataset", font_size=24)
        dataset = Group(dataset_recs, dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        dataset.move_to([-2, 0, 0])
        self.add(dataset)

        # A DataLoader is iterated directly (it is not called), matching the
        # snippets shown in the later stages.
        code = Code(
            code="dataloader = DataLoader(...)\nfor batch in dataloader:\n\t...",
            tab_width=4,
            background="window",
            language="Python",
            font="Monospace",
            font_size=14,
            corner_radius=.2,
            insert_line_no=False,
            line_spacing=.75,
            style=Code.styles_list[1],
        )
        code.move_to([-3.5, 2.5, 0])
        self.add(code)

        # The dataloader itself, with the sampler box drawn inside it.
        dataloader = Group(
            Rectangle(color="red", height=2, width=2),
            Text("DataLoader", font_size=24),
        ).arrange(DOWN, buff=.5, aligned_edge=DOWN)
        sampler = Group(
            Rectangle(color="blue", height=1, width=1),
            Text("Sampler", font_size=12),
        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
        dataloader.move_to([1, 0, 0])
        sampler.move_to([.75, .25, 0])
        self.add(dataloader)
        self.add(sampler)

        # Four labelled GPU boxes in a column on the right.
        gpu_groups = [
            Group(
                Rectangle(color="white", height=1, width=1),
                Text(f"GPU {number}", font_size=12),
            ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4, y, 0])
            for number, y in enumerate([2, .5, -1, -2.5], start=1)
        ]
        gpus = [group[0] for group in gpu_groups]  # only the rectangles are animated / used as anchors
        self.add(*gpu_groups)

        # Animate their existence
        self.play(
            *[Create(gpu, run_time=0.5) for gpu in gpus],
            Create(dataset_recs, run_time=1),
            Create(sampler[0], run_time=1),
            Create(dataloader[0], run_time=1),
        )

        step_1 = MarkupText(
            "Without any special care, \nthe same data is sent through each sampler, \nand the same samples are spit out on each GPU",
            font_size=18,
        )
        step_1.move_to([0, -2.5, 0])
        self.play(
            Write(step_1, run_time=4),
        )

        # Highlight the last dataset row and create one filled marker per cell.
        highlight = "BLUE_E"
        sampled_row = dataset_recs[-1]
        new_datasets = []
        for cell in sampled_row:
            marker = (
                Rectangle(height=0.46 / 2, width=0.46 / 2)
                .set_stroke(width=0.)
                .set_fill(highlight, opacity=0.7)
            )
            marker.move_to(cell)
            new_datasets.append(marker)
        self._stack_in_pairs(new_datasets, sampler)

        self.play(*[cell.animate(run_time=0.5).set_stroke(highlight) for cell in sampled_row])
        self.play(*[MoveToTarget(marker, run_time=1.5) for marker in new_datasets])
        self.wait()

        # Send the same markers to every GPU. Copies are used for all but the
        # last GPU, so the original markers themselves travel to the last one.
        move_animation = []
        last_gpu = len(gpus) - 1
        for gpu_index, gpu in enumerate(gpus):
            if gpu_index == last_gpu:
                batch = new_datasets
            else:
                batch = [marker.copy() for marker in new_datasets]
            self._stack_in_pairs(batch, gpu)
            move_animation.extend(MoveToTarget(marker, run_time=1.5) for marker in batch)

        self.play(*move_animation)

        self.remove(step_1)
        step_2 = MarkupText(
            "This behavior is undesirable, because we want\neach GPU to see different data for efficient training.",
            font_size=18,
        )
        step_2.move_to([0, -2.5, 0])

        self.play(
            Write(step_2, run_time=2.5),
        )
        self.wait()

================================================
FILE: manim_animations/dataloaders/stage_3.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from manim import *

class Stage3(Scene):
    """Static slide listing the two sampler-wrapping approaches named in the text."""

    def construct(self):
        """Stack three left-aligned text blocks under one another and add them to the scene."""
        text_size = 24

        # Pango markup: the class names are tinted with manim's RED / BLUE constants.
        sharding_markup = f"1. Sharding the dataset before drawing:\n\t● <span fgcolor='{RED}'>IterableDatasetShard</span>\n\t● <span fgcolor='{RED}'>BatchSamplerShard</span>"
        splitting_markup = f"\n\n2. Splitting the batch after drawing:\n\t● <span fgcolor='{BLUE}'>DataLoaderDispatcher</span>"

        intro = MarkupText(
            "To combat this, Accelerate employs one of two different\nSampler wrapper methods depending on the scenario:",
            font_size=text_size,
        )
        intro.move_to([0, 1.5, 0])

        # Each block hangs below the previous one, sharing its left edge.
        sharding = MarkupText(sharding_markup, font_size=text_size)
        sharding.next_to(intro, direction=DOWN, aligned_edge=LEFT)

        splitting = MarkupText(splitting_markup, font_size=text_size)
        splitting.next_to(sharding, direction=DOWN, aligned_edge=LEFT)

        self.add(intro, sharding, splitting)

================================================
FILE: manim_animations/dataloaders/stage_4.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from manim import *

class Stage4(Scene):
    """Write out, one block at a time, the definitions of `batch_size` and `global_batch_size`."""

    def construct(self):
        """Create four stacked text blocks, then play a ``Write`` animation for each in order."""
        # (markup, seconds the Write animation should take), in display order.
        script = [
            (
                f"To understand the next part fully, let's define two terms,\n<span fgcolor='{RED}'>`batch_size`</span> and <span fgcolor='{BLUE}'>`global_batch_size`</span>:",
                4,
            ),
            (
                f"\n\n● <span fgcolor='{RED}'>`batch_size`</span>: \n\tThis will be defined as the batch size seen on a given\n\t*individual* GPU",
                4,
            ),
            (
                f"\n\n● <span fgcolor='{BLUE}'>`global_batch_size`</span>:\n\tThis will be defined as the *total* number of\n\tdifferent items seen in the dataset, across all GPUs",
                4,
            ),
            (
                "\n\nSo if we have a dataset of 64 items, 8 GPUs, \nand a `batch_size` of 8, each *step* will go through\nthe entire dataset one time as 8*8=64",
                6,
            ),
        ]

        # Lay out every block before anything is played: the first is placed
        # explicitly, each later one hangs below its predecessor, left-aligned.
        timeline = []
        above = None
        for markup, seconds in script:
            block = MarkupText(markup, font_size=18)
            if above is None:
                block.move_to([0, 1.5, 0])
            else:
                block.next_to(above, direction=DOWN, aligned_edge=LEFT)
            timeline.append((block, seconds))
            above = block

        for block, seconds in timeline:
            self.play(Write(block, run_time=seconds))
        self.wait()

================================================
FILE: manim_animations/dataloaders/stage_5.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from manim import *

class Stage5(Scene):
    """Animate the sharded case: each dataset row goes to its own sampler, then to its own GPU.

    The dataset is drawn as four colored rows of eight cells. Row ``i`` is sent
    to sampler box ``3 - i`` and afterwards to output GPU box ``3 - i``, so the
    rows are paired with the boxes in reverse order.
    """

    @staticmethod
    def _layout_in_box(items, box):
        """Give each item a ``target`` packed in rows of four from the top-left corner of ``box``.

        Items at positions that are multiples of four are anchored to the box's
        upper-left corner; those that are not also multiples of eight are then
        lowered by 0.25 to form the second row. Every other item is placed
        directly to the right of the previous item's target.

        Args:
            items: Mobjects to lay out; ``generate_target()`` is called on each.
            box: Mobject whose upper-left corner anchors the layout.
        """
        corner = box.get_corner(UP + LEFT)
        previous = None
        for position, item in enumerate(items):
            item.generate_target()
            if position % 4 == 0:
                item.target.next_to(corner, buff=.02, direction=RIGHT + DOWN)
                if position % 8 != 0:
                    item.target.set_y(item.target.get_y() - .25)
            else:
                item.target.next_to(previous, direction=RIGHT, buff=0.02)
            previous = item.target

    def construct(self):
        """Build the static layout, then play the highlight / to-sampler / to-GPU sequence."""
        # The dataset items: one row of 8 cells per color, stacked upwards.
        colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
        rows = [
            VGroup(*[Rectangle(height=0.25, width=0.25, color=color) for _ in range(8)]).arrange(RIGHT, buff=0)
            for color in colors
        ]
        dataset_recs = VGroup(*rows).arrange(UP, buff=0)
        dataset_text = Text("Dataset", font_size=24)
        dataset = Group(dataset_recs, dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        dataset.move_to([-2, 0, 0])
        self.add(dataset)

        code = Code(
            code="# We enable this by default\naccelerator = Accelerator()\ndataloader = DataLoader(...)\ndataloader = accelerator.prepare(dataloader)\nfor batch in dataloader:\n\t...",
            tab_width=4,
            background="window",
            language="Python",
            font="Monospace",
            font_size=14,
            corner_radius=.2,
            insert_line_no=False,
            line_spacing=.75,
            style=Code.styles_list[1],
        )
        code.move_to([-3.5, 2.5, 0])
        self.add(code)

        # One labelled sampler box and one labelled output box per GPU, in two columns.
        box_heights = [2, .5, -1., -2.5]
        sampler_groups = [
            Group(
                Rectangle(color="blue", height=1, width=1),
                Text(f"Sampler GPU {number}", font_size=12),
            ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([2, y, 0])
            for number, y in enumerate(box_heights, start=1)
        ]
        self.add(*sampler_groups)
        samplers = [group[0] for group in sampler_groups]

        gpu_groups = [
            Group(
                Rectangle(color="white", height=1, width=1),
                Text(f"Output GPU {number}", font_size=12),
            ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, y, 0])
            for number, y in enumerate(box_heights, start=1)
        ]
        gpus = [group[0] for group in gpu_groups]
        self.add(*gpu_groups)

        # Animate their existence
        self.play(
            *[Create(gpu, run_time=1) for gpu in gpus],
            Create(dataset_recs, run_time=1),
            *[Create(sampler, run_time=1) for sampler in samplers],
        )

        # For every dataset cell create a filled marker in the row's color and
        # plan its move into the sampler box paired with that row.
        first_animations = []
        second_animations = []
        new_datasets = []
        for row_cells, color, sampler_box in zip(dataset_recs, colors, reversed(samplers)):
            markers = []
            for cell in row_cells:
                marker = (
                    Rectangle(height=0.46 / 2, width=0.46 / 2)
                    .set_stroke(width=0.)
                    .set_fill(color, opacity=0.7)
                )
                marker.move_to(cell)
                markers.append(marker)
                first_animations.append(cell.animate(run_time=0.5).set_stroke(color))
            self._layout_in_box(markers, sampler_box)
            second_animations.extend(MoveToTarget(marker, run_time=1.5) for marker in markers)
            new_datasets.append(markers)

        step_1 = MarkupText(
            "Since we splice the dataset between each GPU,\nthe models weights can be averaged during `backward()`\nActing as though we did one giant epoch\nvery quickly.",
            font_size=18,
        )
        step_1.move_to([-2.5, -2, 0])

        self.play(
            Write(step_1, run_time=3),
        )
        self.play(
            *first_animations,
        )
        self.play(*second_animations)
        self.wait(duration=.5)

        # Same layout again, this time inside the output box paired with each
        # row; the markers keep their order.
        move_animation = []
        for markers, gpu_box in zip(new_datasets, reversed(gpus)):
            self._layout_in_box(markers, gpu_box)
            move_animation.extend(MoveToTarget(marker, run_time=1.5) for marker in markers)

        self.play(*move_animation)
        self.wait()

================================================
FILE: manim_animations/dataloaders/stage_6.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from manim import *


class Stage6(Scene):
    """Animate the sharded case with shuffling: markers reach each GPU box in a random order.

    The dataset is drawn as four colored rows of eight cells. Row ``i`` is sent
    to sampler box ``3 - i`` and afterwards, in a randomly permuted order, to
    output GPU box ``3 - i``.
    """

    @staticmethod
    def _layout_in_box(items, box):
        """Give each item a ``target`` packed in rows of four from the top-left corner of ``box``.

        Items at positions that are multiples of four are anchored to the box's
        upper-left corner; those that are not also multiples of eight are then
        lowered by 0.25 to form the second row. Every other item is placed
        directly to the right of the previous item's target.

        Args:
            items: Mobjects to lay out; ``generate_target()`` is called on each.
            box: Mobject whose upper-left corner anchors the layout.
        """
        corner = box.get_corner(UP + LEFT)
        previous = None
        for position, item in enumerate(items):
            item.generate_target()
            if position % 4 == 0:
                item.target.next_to(corner, buff=.02, direction=RIGHT + DOWN)
                if position % 8 != 0:
                    item.target.set_y(item.target.get_y() - .25)
            else:
                item.target.next_to(previous, direction=RIGHT, buff=0.02)
            previous = item.target

    def construct(self):
        """Build the static layout, then play the highlight / to-sampler / shuffled-to-GPU sequence."""
        import random

        # The dataset items: one row of 8 cells per color, stacked upwards.
        colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
        rows = [
            VGroup(*[Rectangle(height=0.25, width=0.25, color=color) for _ in range(8)]).arrange(RIGHT, buff=0)
            for color in colors
        ]
        dataset_recs = VGroup(*rows).arrange(UP, buff=0)
        dataset_text = Text("Dataset", font_size=24)
        dataset = Group(dataset_recs, dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        dataset.move_to([-2, 0, 0])
        self.add(dataset)

        code = Code(
            code="# We enable this by default\naccelerator = Accelerator()\ndataloader = DataLoader(..., shuffle=True)\ndataloader = accelerator.prepare(dataloader)\nfor batch in dataloader:\n\t...",
            tab_width=4,
            background="window",
            language="Python",
            font="Monospace",
            font_size=14,
            corner_radius=.2,
            insert_line_no=False,
            line_spacing=.75,
            style=Code.styles_list[1],
        )
        code.move_to([-3.5, 2.5, 0])
        self.add(code)

        # One labelled sampler box and one labelled output box per GPU, in two columns.
        box_heights = [2, .5, -1., -2.5]
        sampler_groups = [
            Group(
                Rectangle(color="blue", height=1, width=1),
                Text(f"Sampler GPU {number}", font_size=12),
            ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([2, y, 0])
            for number, y in enumerate(box_heights, start=1)
        ]
        self.add(*sampler_groups)
        samplers = [group[0] for group in sampler_groups]

        gpu_groups = [
            Group(
                Rectangle(color="white", height=1, width=1),
                Text(f"Output GPU {number}", font_size=12),
            ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, y, 0])
            for number, y in enumerate(box_heights, start=1)
        ]
        gpus = [group[0] for group in gpu_groups]
        self.add(*gpu_groups)

        # For every dataset cell create a filled marker in the row's color and
        # plan its move into the sampler box paired with that row.
        first_animations = []
        second_animations = []
        new_datasets = []
        for row_cells, color, sampler_box in zip(dataset_recs, colors, reversed(samplers)):
            markers = []
            for cell in row_cells:
                marker = (
                    Rectangle(height=0.46 / 2, width=0.46 / 2)
                    .set_stroke(width=0.)
                    .set_fill(color, opacity=0.7)
                )
                marker.move_to(cell)
                markers.append(marker)
                first_animations.append(cell.animate(run_time=0.5).set_stroke(color))
            self._layout_in_box(markers, sampler_box)
            second_animations.extend(MoveToTarget(marker, run_time=1.5) for marker in markers)
            new_datasets.append(markers)

        step_1 = MarkupText(
            "During shuffling, each mini-batch's\noutput order will be modified",
            font_size=18,
        )
        step_1.move_to([-1.5, -2, 0])

        self.play(
            Write(step_1, run_time=3),
        )
        self.play(
            *first_animations,
        )
        self.play(*second_animations)
        self.wait(duration=.5)

        # Permute each row before laying it out in its output box, so the
        # markers land in a random order (unseeded, so it differs per render).
        move_animation = []
        for markers, gpu_box in zip(new_datasets, reversed(gpus)):
            shuffled = random.sample(markers, len(markers))
            self._layout_in_box(shuffled, gpu_box)
            move_animation.extend(MoveToTarget(marker, run_time=1.5) for marker in shuffled)

        self.play(*move_animation)
        self.wait()

================================================
FILE: manim_animations/dataloaders/stage_7.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from manim import *

class Stage7(Scene):
    """Manim scene illustrating `Accelerator(dispatch_batches=True)`.

    The frame shows a code snippet, a 4x8 grid of dataset items, four sampler boxes and four
    output-GPU boxes. The animation first recolors every dataset item, moves a copy of every item
    into the first sampler box, and finally moves each row of copies into one output-GPU box
    (row 0 goes to the last GPU box, row 3 to the first).
    """

    def construct(self):
        """Build the static layout and play the three animation phases."""
        # Code snippet displayed in the upper-left part of the frame.
        code = Code(
            code="accelerator = Accelerator(dispatch_batches=True)\ndataloader = DataLoader(...)\ndataloader = accelerator.prepare(dataloader)\nfor batch in dataloader:\n\t...",
            tab_width=4,
            background="window",
            language="Python",
            font="Monospace",
            font_size=14,
            corner_radius=0.2,
            insert_line_no=False,
            line_spacing=0.75,
            style=Code.styles_list[1],
        )
        code.move_to([-3.5, 2.5, 0])
        self.add(code)

        # The dataset items: one row of eight squares per color, stacked upwards.
        colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
        rows = [
            VGroup(*[Rectangle(height=0.25, width=0.25, color=color) for _ in range(8)]).arrange(RIGHT, buff=0)
            for color in colors
        ]
        dataset_recs = VGroup(*rows).arrange(UP, buff=0)
        dataset_text = Text("Dataset", font_size=24)
        dataset = Group(dataset_recs, dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        dataset.move_to([-2, 0, 0])
        self.add(dataset)

        # The dataloader itself: one labelled sampler box and one labelled output box per GPU,
        # laid out top to bottom at the same four heights.
        slot_heights = [2, 0.5, -1, -2.5]
        sampler_groups = [
            Group(
                Rectangle(color="blue", height=1.02, width=1.02),
                Text(f"Sampler GPU {rank}", font_size=12),
            ).arrange(DOWN, buff=0.25, aligned_edge=DOWN).move_to([2, y, 0])
            for rank, y in enumerate(slot_heights, start=1)
        ]
        self.add(*sampler_groups)
        # Only the rectangles are needed later as anchors for positioning.
        samplers = [group[0] for group in sampler_groups]

        gpu_groups = [
            Group(
                Rectangle(color="white", height=1.02, width=0.98),
                Text(f"Output GPU {rank}", font_size=12),
            ).arrange(DOWN, buff=0.25, aligned_edge=DOWN).move_to([4.5, y, 0])
            for rank, y in enumerate(slot_heights, start=1)
        ]
        gpus = [group[0] for group in gpu_groups]
        self.add(*gpu_groups)

        step_1 = MarkupText(
            "When using a `DataLoaderDispatcher`, all\nof the samples are collected from GPU 0's dataset,\nthen divided and sent to each GPU.\nAs a result, this will be slower.",
            font_size=18
        )
        step_1.move_to([-2.5, -2, 0])

        self.play(
            Write(step_1, run_time=3.5),
        )

        # Phase 1 recolors the dataset squares; phase 2 moves a filled copy of every item into
        # the first sampler box, two lines of four items per dataset row.
        first_animations = []
        second_animations = []
        old_target = None
        new_datasets = []
        for i, row_data in enumerate(dataset_recs):
            current_color = colors[i]
            new_row = []
            for j, indiv_data in enumerate(row_data):
                dataset_target = (
                    Rectangle(height=0.46 / 4, width=0.46 / 2)
                    .set_stroke(width=0.0)
                    .set_fill(current_color, opacity=0.7)
                )
                dataset_target.move_to(indiv_data)
                dataset_target.generate_target()
                if j % 4 == 0:
                    # Items 0 and 4 start a new line anchored at the sampler's bottom-left corner.
                    dataset_target.target.next_to(
                        samplers[0].get_corner(DOWN + LEFT), buff=0.0125, direction=RIGHT + UP,
                    )
                    base_y = dataset_target.target.get_y()
                    if j % 8 == 0:
                        dataset_target.target.set_y(base_y + (0.25 * i))
                    else:
                        # Second line of the same dataset row sits .125 above the first.
                        dataset_target.target.set_y(base_y + 0.125 + (0.25 * i))
                else:
                    # Every other item is chained to the right of the previously placed one.
                    dataset_target.target.next_to(
                        old_target, direction=RIGHT, buff=0.0125,
                    )
                old_target = dataset_target.target
                new_row.append(dataset_target)
                first_animations.append(indiv_data.animate(run_time=0.5).set_stroke(current_color))
                second_animations.append(MoveToTarget(dataset_target, run_time=1.5))

            new_datasets.append(new_row)
        self.play(
            *first_animations,
        )
        self.play(*second_animations)

        # Phase 3 sends each row of copies to one output box, in reverse order:
        # row 0 -> last GPU box, ..., last row -> first GPU box.
        move_animation = []
        for i, row in enumerate(new_datasets):
            gpu_box = gpus[len(gpus) - 1 - i]
            for j, indiv_data in enumerate(row):
                indiv_data.generate_target()
                # NOTE(review): this builder is never passed to `self.play`; presumably it is
                # relied on for its effect on `indiv_data.target` - confirm against the installed
                # manim version before removing or replacing it.
                indiv_data.animate.stretch_to_fit_height(0.46 / 2)
                if j % 4 == 0:
                    # Items 0 and 4 start a new line anchored at the GPU box's top-left corner.
                    indiv_data.target.next_to(
                        gpu_box.get_corner(UP + LEFT), buff=0.01, direction=RIGHT + DOWN,
                    )
                    if j % 8 == 0:
                        # The first line is placed .25 lower than the second one.
                        indiv_data.target.set_y(indiv_data.target.get_y() - 0.25)
                else:
                    indiv_data.target.next_to(
                        old_target, direction=RIGHT, buff=0.01,
                    )
                old_target = indiv_data.target
                move_animation.append(MoveToTarget(indiv_data, run_time=1.5))

        self.play(*move_animation)
        self.wait()

================================================
FILE: pyproject.toml
================================================
[tool.ruff]
line-length = 119
target-version = "py310"

[tool.ruff.lint]
preview = true
extend-select = [
    "B009", # static getattr
    "B010", # static setattr
    "CPY", # Copyright
    "E", # PEP8 errors
    "F", # Pyflakes
    "I", # Import sorting
    "TID251", # Banned API
    "UP", # Pyupgrade
    "W", # PEP8 warnings
]
ignore = [
    "E501", # Line length (handled by ruff-format)
    "E741", # Ambiguous variable name
    "W605", # Invalid escape sequence
    "UP007", # X | Y type annotations
    "UP045", # Use `X | None` for type annotations
    "UP035", # temporarily disabled to minimize upgrade changes

]

[tool.ruff.lint.per-file-ignores]
"__init__.py" = [
    "F401", # Ignore seemingly unused imports (they're meant for re-export)
]
"manim_animations/*" = ["ALL"]

[tool.ruff.lint.isort]
lines-after-imports = 2
known-first-party = ["accelerate"]

[tool.ruff.format]
exclude = [
    "manim_animations/*"
]

[tool.ruff.lint.flake8-tidy-imports.banned-api]
"os.getenv".msg = "Use os.environ instead"
"os.putenv".msg = "Use os.environ instead"
"os.unsetenv".msg = "Use os.environ instead"


================================================
FILE: setup.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from setuptools import find_packages, setup


# Optional dependency groups, keyed by the extra name passed to `pip install accelerate[<name>]`.
extras = {
    "quality": ["ruff == 0.13.1"],
    "docs": [],
    "test_prod": ["pytest>=7.2.0", "pytest-xdist", "pytest-subtests", "parameterized", "pytest-order"],
    "test_dev": [
        "datasets",
        "diffusers",
        "evaluate",
        "torchdata>=0.8.0",
        "torchpippy>=0.2.0",
        "transformers",
        "scipy",
        "scikit-learn",
        "tqdm",
        "bitsandbytes",
        "timm",
    ],
}
# "testing" is the union of the two test groups above.
extras["testing"] = [*extras["test_prod"], *extras["test_dev"]]
extras.update(
    {
        "deepspeed": ["deepspeed"],
        "rich": ["rich"],
        # note: TE for now needs to be done via pulling down the docker image directly
        "test_fp8": ["torchao"],
        "test_trackers": [
            "wandb",
            "comet-ml",
            "tensorboard",
            "dvclive",
            # "mlflow", too many deps that lead to download a very old version of the lib
            "matplotlib",
            "swanlab[dashboard]",  # dashboard required for local use
            "trackio",
        ],
    }
)
# "dev" bundles linting, the full test suite and rich output.
extras["dev"] = [*extras["quality"], *extras["testing"], *extras["rich"]]
extras["sagemaker"] = [
    "sagemaker",  # boto3 is a required package in sagemaker
]

# Read the README through a context manager so the file handle is closed as soon as the
# contents are loaded, instead of being left open until garbage collection.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# Package metadata: sources live under `src/`, and the console scripts below expose the
# `accelerate` CLI and its sub-command shortcuts.
setup(
    name="accelerate",
    version="1.14.0.dev0",
    description="Accelerate",
    long_description=long_description,
    long_description_content_type="text/markdown",
    keywords="deep learning",
    license="Apache",
    author="The Hugging Face team",
    author_email="transformers@huggingface.co",
    url="https://github.com/huggingface/accelerate",
    package_dir={"": "src"},
    packages=find_packages("src"),
    entry_points={
        "console_scripts": [
            "accelerate=accelerate.commands.accelerate_cli:main",
            "accelerate-config=accelerate.commands.config:main",
            "accelerate-estimate-memory=accelerate.commands.estimate:main",
            "accelerate-launch=accelerate.commands.launch:main",
            "accelerate-merge-weights=accelerate.commands.merge:main",
        ]
    },
    python_requires=">=3.10.0",
    install_requires=[
        "numpy>=1.17",
        "packaging>=20.0",
        "psutil",
        "pyyaml",
        "torch>=2.0.0",
        "huggingface_hub>=0.21.0",
        "safetensors>=0.4.3",
    ],
    extras_require=extras,
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.10",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
)

# Release checklist
# 1. Checkout the release branch (for a patch the current release branch, for a new minor version, create one):
#      git checkout -b vXX.xx-release
#    The -b is only necessary for creation (so remove it when doing a patch)
# 2. Change the version in __init__.py and setup.py to the proper value.
# 3. Commit these changes with the message: "Release: v<VERSION>"
# 4. Add a tag in git to mark the release:
#      git tag v<VERSION> -m 'Adds tag v<VERSION> for pypi'
#    Push the tag and release commit to git: git push --tags origin vXX.xx-release
# 5. Run the following commands in the top-level directory:
#      make prepare_release
# 6. Upload the package to the pypi test server first:
#      make target=testpypi upload_release
# 7. Check that you can install it in a virtualenv by running:
#      make install_test_release
#      accelerate env
#      accelerate test
# 8. Upload the final version to actual pypi:
#      make target=pypi upload_release
# 9. Add release notes to the tag in github once everything is looking hunky-dory.
# 10. Go back to the main branch and update the version in __init__.py, setup.py to the new version ".dev" and push to
#     main.


================================================
FILE: src/accelerate/__init__.py
================================================
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "1.14.0.dev0"

from .accelerator import Accelerator
from .big_modeling import (
    cpu_offload,
    cpu_offload_with_hook,
    disk_offload,
    dispatch_model,
    init_empty_weights,
    init_on_device,
    load_checkpoint_and_dispatch,
)
from .data_loader import skip_first_batches
from .inference import prepare_pippy
from .launchers import debug_launcher, notebook_launcher
from .parallelism_config import ParallelismConfig
from .state import PartialState
from .utils import (
    AutocastKwargs,
    DataLoaderConfiguration,
    DDPCommunicationHookType,
    DeepSpeedPlugin,
    DistributedDataParallelKwargs,
    DistributedType,
    FullyShardedDataParallelPlugin,
    GradScalerKwargs,
    InitProcessGroupKwargs,
    ProfileKwargs,
    find_executable_batch_size,
    infer_auto_device_map,
    is_rich_available,
    load_checkpoint_in_model,
    synchronize_rng_states,
)


if is_rich_available():
    from .utils import rich


================================================
FILE: src/accelerate/accelerator.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import contextlib
import functools
import inspect
import json
import math
import os
import re
import shutil
import warnings
from collections import OrderedDict
from contextlib import contextmanager
from functools import partial
from types import MethodType
from typing import Any, Callable, Union

import torch
import torch.utils.hooks as hooks

from accelerate.utils.dataclasses import FP8BackendType

from .big_modeling import _attach_context_parallel_hooks
from .checkpointing import load_accelerator_state, load_custom_state, save_accelerator_state, save_custom_state
from .data_loader import DataLoaderDispatcher, prepare_data_loader, skip_first_batches
from .logging import get_logger
from .optimizer import AcceleratedOptimizer
from .parallelism_config import ParallelismConfig
from .scheduler import AcceleratedScheduler
from .state import AcceleratorState, GradientState, PartialState
from .tracking import LOGGER_TYPE_TO_CLASS, GeneralTracker, filter_trackers
from .utils import (
    MODEL_NAME,
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
    SAFE_WEIGHTS_PATTERN_NAME,
    WEIGHTS_INDEX_NAME,
    WEIGHTS_NAME,
    WEIGHTS_PATTERN_NAME,
    AORecipeKwargs,
    AutocastKwargs,
    DataLoaderConfiguration,
    DeepSpeedPlugin,
    DistributedDataParallelKwargs,
    DistributedType,
    DynamoBackend,
    FP8RecipeKwargs,
    FullyShardedDataParallelPlugin,
    GradientAccumulationPlugin,
    GradScalerKwargs,
    InitProcessGroupKwargs,
    KwargsHandler,
    LoggerType,
    MegatronLMPlugin,
    MSAMPRecipeKwargs,
    PrecisionType,
    ProfileKwargs,
    ProjectConfiguration,
    RNGType,
    TERecipeKwargs,
    TorchDynamoPlugin,
    TorchTensorParallelPlugin,
    apply_fp8_autowrap,
    check_os_kernel,
    clean_state_dict_for_safetensors,
    compare_versions,
    convert_model,
    convert_model_to_fp8_ao,
    convert_outputs_to_fp32,
    ensure_weights_retied,
    extract_model_from_parallel,
    fsdp2_apply_ac,
    fsdp2_canonicalize_names,
    fsdp2_prepare_model,
    fsdp2_switch_optimizer_parameters,
    gather,
    gather_object,
    get_fsdp2_grad_scaler,
    get_grad_scaler,
    get_mixed_precision_context_manager,
    get_pretty_name,
    has_offloaded_params,
    is_bf16_available,
    is_bitsandbytes_multi_backend_available,
    is_deepspeed_available,
    is_lomo_available,
    is_megatron_lm_available,
    is_mlu_available,
    is_msamp_available,
    is_musa_available,
    is_npu_available,
    is_torch_version,
    is_torch_xla_available,
    is_torchao_available,
    is_transformer_engine_available,
    is_xpu_available,
    load_fsdp_model,
    load_fsdp_optimizer,
    model_has_dtensor,
    pad_across_processes,
    parse_choice_from_env,
    recursively_apply,
    reduce,
    release_memory,
    save,
    save_fsdp_model,
    save_fsdp_optimizer,
    wait_for_everyone,
)
from .utils.constants import (
    DTENSOR_PYTORCH_VERSION,
    FSDP2_PYTORCH_VERSION,
    FSDP_PYTORCH_VERSION,
    PROFILE_PATTERN_NAME,
    SCALER_NAME,
)
from .utils.modeling import get_state_dict_offloaded_model
from .utils.other import compile_regions, compile_regions_deepspeed, is_compiled_module


if is_deepspeed_available():
    from .utils import (
        DeepSpeedEngineWrapper,
        DeepSpeedOptimizerWrapper,
        DeepSpeedSchedulerWrapper,
        DummyOptim,
        DummyScheduler,
        map_pytorch_optim_to_deepspeed,
    )

if is_megatron_lm_available():
    from .utils import (
        MegatronEngine,
        MegatronLMDummyDataLoader,
        MegatronLMDummyScheduler,
        MegatronLMOptimizerWrapper,
        MegatronLMSchedulerWrapper,
        megatron_lm_initialize,
        megatron_lm_prepare_data_loader,
        megatron_lm_prepare_model_optimizer_scheduler,
    )

if torch.distributed.is_available():
    from torch.distributed.algorithms.join import Join


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.xla_multiprocessing as xmp


if is_npu_available(check_device=False):
    import torch_npu  # noqa: F401


try:
    from torch.optim.lr_scheduler import LRScheduler
except ImportError:
    from torch.optim.lr_scheduler import _LRScheduler as LRScheduler

logger = get_logger(__name__)

# Sentinel values for defaults: each name below is bound to its own unique `object()` instance.
# `_split_batches` is the default value of the `split_batches` parameter of `Accelerator.__init__`.
# NOTE(review): presumably the constructor compares against these by identity to tell an omitted
# argument apart from an explicitly passed value, and the other three play the same role for the
# like-named options - confirm against the rest of this file.
_split_batches = object()
_dispatch_batches = object()
_even_batches = object()
_use_seedable_sampler = object()


class Accelerator:
    """
    Creates an instance of an accelerator for distributed training or mixed precision training.

    Args:
        device_placement (`bool`, *optional*, defaults to `True`):
            Whether or not the accelerator should put objects on device (tensors yielded by the dataloader, model,
            etc...).
        mixed_precision (`str`, *optional*):
            Whether or not to use mixed precision training. Choose from 'no','fp16','bf16' or 'fp8'. Will default to
            the value in the environment variable `ACCELERATE_MIXED_PRECISION`, which will use the default value in the
            accelerate config of the current system or the flag passed with the `accelerate.launch` command. 'fp8'
            requires the installation of transformers-engine.
        gradient_accumulation_steps (`int`, *optional*, default to 1):
            The number of steps that should pass before gradients are accumulated. A number > 1 should be combined with
            `Accelerator.accumulate`. If not passed, will default to the value in the environment variable
            `ACCELERATE_GRADIENT_ACCUMULATION_STEPS`. Can also be configured through a `GradientAccumulationPlugin`.
        cpu (`bool`, *optional*):
            Whether or not to force the script to execute on CPU. Will ignore GPU available if set to `True` and force
            the execution on one process only.
        dataloader_config (`DataLoaderConfiguration`, *optional*):
            A configuration for how the dataloaders should be handled in distributed scenarios.
        deepspeed_plugin ([`~utils.DeepSpeedPlugin`] or dict of `str`: [`~utils.DeepSpeedPlugin`], *optional*):
            Tweak your DeepSpeed related args using this argument. This argument is optional and can be configured
            directly using *accelerate config*. If using multiple plugins, use the configured `key` property of each
            plugin to access them from `accelerator.state.get_deepspeed_plugin(key)`. Alias for `deepspeed_plugins`.
        fsdp_plugin ([`~utils.FullyShardedDataParallelPlugin`], *optional*):
            Tweak your FSDP related args using this argument. This argument is optional and can be configured directly
            using *accelerate config*
        torch_tp_plugin ([`~utils.TorchTensorParallelPlugin`], *optional*):
            Deprecated: use `parallelism_config` with `tp_size` instead.
        megatron_lm_plugin ([`~utils.MegatronLMPlugin`], *optional*):
            Tweak your MegatronLM related args using this argument. This argument is optional and can be configured
            directly using *accelerate config*
        rng_types (list of `str` or [`~utils.RNGType`]):
            The list of random number generators to synchronize at the beginning of each iteration in your prepared
            dataloaders. Should be one or several of:

            - `"torch"`: the base torch random number generator
            - `"cuda"`: the CUDA random number generator (GPU only)
            - `"xla"`: the XLA random number generator (TPU only)
            - `"generator"`: the `torch.Generator` of the sampler (or batch sampler if there is no sampler in your
              dataloader) or of the iterable dataset (if it exists) if the underlying dataset is of that type.

            Will default to `["torch"]` for PyTorch versions <=1.5.1 and `["generator"]` for PyTorch versions >= 1.6.
        log_with (list of `str`, [`~utils.LoggerType`] or [`~tracking.GeneralTracker`], *optional*):
            A list of loggers to be setup for experiment tracking. Should be one or several of:

            - `"all"`
            - `"tensorboard"`
            - `"wandb"`
            - `"trackio"`
            - `"aim"`
            - `"comet_ml"`
            - `"mlflow"`
            - `"dvclive"`
            - `"swanlab"`
            If `"all"` is selected, will pick up all available trackers in the environment and initialize them. Can
            also accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
        project_config ([`~utils.ProjectConfiguration`], *optional*):
            A configuration for how saving the state can be handled.
        project_dir (`str`, `os.PathLike`, *optional*):
            A path to a directory for storing data such as logs of locally-compatible loggers and potentially saved
            checkpoints.
        step_scheduler_with_optimizer (`bool`, *optional*, defaults to `True`):
            Set `True` if the learning rate scheduler is stepped at the same time as the optimizer, `False` if only
            done under certain circumstances (at the end of each epoch, for instance).
        kwargs_handlers (list of [`~utils.KwargsHandler`], *optional*)
            A list of [`~utils.KwargsHandler`] to customize how the objects related to distributed training, profiling
            or mixed precision are created. See [kwargs](kwargs) for more information.
        dynamo_backend (`str` or [`~utils.DynamoBackend`], *optional*, defaults to `"no"`):
            Set to one of the possible dynamo backends to optimize your training with torch dynamo.
        dynamo_plugin ([`~utils.TorchDynamoPlugin`], *optional*):
            A configuration for how torch dynamo should be handled, if more tweaking than just the `backend` or `mode`
            is needed.
        gradient_accumulation_plugin ([`~utils.GradientAccumulationPlugin`], *optional*):
            A configuration for how gradient accumulation should be handled, if more tweaking than just the
            `gradient_accumulation_steps` is needed.

    **Available attributes:**

        - **device** (`torch.device`) -- The device to use.
        - **distributed_type** ([`~utils.DistributedType`]) -- The distributed training configuration.
        - **local_process_index** (`int`) -- The process index on the current machine.
        - **mixed_precision** (`str`) -- The configured mixed precision mode.
        - **num_processes** (`int`) -- The total number of processes used for training.
        - **optimizer_step_was_skipped** (`bool`) -- Whether or not the optimizer update was skipped (because of
          gradient overflow in mixed precision), in which
        case the learning rate should not be changed.
        - **process_index** (`int`) -- The overall index of the current process among all processes.
        - **state** ([`~state.AcceleratorState`]) -- The distributed setup state.
        - **sync_gradients** (`bool`) -- Whether the gradients are currently being synced across all processes.
        - **use_distributed** (`bool`) -- Whether the current configuration is for distributed training.
    """

    def __init__(
        self,
        device_placement: bool = True,
        split_batches: bool = _split_batches,
        mixed_precision: PrecisionType | str | None = None,
        gradient_accumulation_steps: int = 1,
        cpu: bool = False,
        dataloader_config: DataLoaderConfiguration | None = None,
        deepspeed_plugin: DeepSpeedPlugin | dict[str, DeepSpeedPlugin] | None = None,
        fsdp_plugin: FullyShardedDataParallelPlugin | None = None,
        torch_tp_plugin: TorchTensorParallelPlugin | None = None,  # Deprecate later, warning in `post_init`
        megatron_lm_plugin: MegatronLMPlugin | None = None,
        rng_types: list[str | RNGType] | None = None,
        log_with: str | LoggerType | GeneralTracker | list[str | LoggerType | GeneralTracker] | None = None,
        project_dir: str | os.PathLike | None = None,
        project_config: ProjectConfiguration | None = None,
        gradient_accumulation_plugin: GradientAccumulationPlugin | None = None,
        step_scheduler_with_optimizer: bool = True,
        kwargs_handlers: list[KwargsHandler] | None = None,
        dynamo_backend: DynamoBackend | str | None = None,
        dynamo_plugin: TorchDynamoPlugin | None = None,
        deepspeed_plugins: DeepSpeedPlugin | dict[str, DeepSpeedPlugin] | None = None,
        parallelism_config: ParallelismConfig | None = None,
    ):
        self.trackers = []
        if project_config is not None:
            self.project_configuration = project_config
        else:
            self.project_configuration = ProjectConfiguration(project_dir=project_dir)
        if project_dir is not None and self.project_dir is None:
            self.project_configuration.set_directories(project_dir)

        if mixed_precision is not None:
            mixed_precision = str(mixed_precision)
            if mixed_precision not in PrecisionType:
                raise ValueError(
                    f"Unknown mixed_precision mode: {mixed_precision}. Choose between {PrecisionType.list()}"
                )
        if torch_tp_plugin is not None:
            warnings.warn(
                "`TorchTensorParallelPlugin` is deprecated and will be removed in a future version of Accelerate. "
                "Please use the `ParallelismConfig` with `tp_size` instead.",
                FutureWarning,
            )

        if dynamo_plugin is not None and dynamo_backend is not None:
            raise ValueError("You cannot pass in both `dynamo_plugin` and `dynamo_backend`, please only pass in one.")
        if dynamo_backend is not None:
            dynamo_plugin = TorchDynamoPlugin(backend=dynamo_backend)
        elif dynamo_plugin is None:
            dynamo_plugin = TorchDynamoPlugin()

        if deepspeed_plugins is not None and deepspeed_plugin is not None:
            raise ValueError("You cannot pass in both `deepspeed_plugins` and `deepspeed_plugin`.")
        elif deepspeed_plugin is not None:
            deepspeed_plugins = deepspeed_plugin

        if deepspeed_plugins is None:
            # First check if we're creating another `Accelerator` w/o setting `deepspeed_plugin`
            if (
                AcceleratorState._shared_state != {}
                and AcceleratorState().distributed_type == DistributedType.DEEPSPEED
            ):
                deepspeed_plugins = AcceleratorState().deepspeed_plugins
            else:
                # init from env variables
                deepspeed_plugins = (
                    DeepSpeedPlugin()
                    if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false").lower() == "true"
                    else None
                )
        else:
            # If we're creating a second `Accelerator`, users shouldn't be passing in a `deepspeed_plugin`
            if (
                AcceleratorState._shared_state != {}
                and AcceleratorState().distributed_type == DistributedType.DEEPSPEED
                and AcceleratorState().deepspeed_plugins is not None
            ):
                raise NotImplementedError(
                    "You cannot pass in a `deepspeed_plugin` when creating a second `Accelerator`. "
                    "Please make sure the first `Accelerator` is initialized with all the plugins you want to use."
                )
            if isinstance(deepspeed_plugins, dict):
                for plugin in deepspeed_plugins.values():
                    if not isinstance(plugin, DeepSpeedPlugin):
                        raise TypeError("`deepspeed_plugin` must be a DeepSpeedPlugin object.")

        if deepspeed_plugins is not None:
            os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"  # use DeepSpeed if plugin is provided
            if not is_deepspeed_available():
                raise ImportError("DeepSpeed is not installed => run `pip install deepspeed` or build it from source.")
            if is_mlu_available():
                if compare_versions("deepspeed", "<", "0.15.2"):
                    raise ImportError("DeepSpeed MLU version must be >= 0.15.2. Please update DeepSpeed.")
            elif is_musa_available():
                if compare_versions("deepspeed", "<", "0.14.3"):
                    raise ImportError("DeepSpeed MUSA version must be >= 0.14.3. Please update DeepSpeed.")
            elif compare_versions("deepspeed", "<", "0.9.3"):
                raise ImportError("DeepSpeed version must be >= 0.9.3. Please update DeepSpeed.")

            self.deepspeed_engine_wrapped = None

        if os.environ.get("ACCELERATE_USE_FSDP", "false").lower() == "true" or isinstance(
            fsdp_plugin, FullyShardedDataParallelPlugin
        ):
            if not is_torch_version(">=", FSDP_PYTORCH_VERSION):
                raise ValueError(f"FSDP requires PyTorch >= {FSDP_PYTORCH_VERSION}")

        if fsdp_plugin is None:  # init from env variables
            fsdp_plugin = (
                FullyShardedDataParallelPlugin()
                if os.environ.get("ACCELERATE_USE_FSDP", "false").lower() == "true"
                else None
            )
        else:
            if not isinstance(fsdp_plugin, FullyShardedDataParallelPlugin):
                raise TypeError("`fsdp_plugin` must be a FullyShardedDataParallelPlugin object.")
            os.environ["ACCELERATE_USE_FSDP"] = "true"  # use FSDP if plugin is provided

        if fsdp_plugin is not None and fsdp_plugin.fsdp_version == 2:
            if not is_torch_version(">=", FSDP2_PYTORCH_VERSION):
                raise ImportError(f"FSDP2 requires PyTorch >= {FSDP2_PYTORCH_VERSION}")

        if megatron_lm_plugin is None:  # init from env variables
            megatron_lm_plugin = (
                MegatronLMPlugin() if os.environ.get("ACCELERATE_USE_MEGATRON_LM", "false").lower() == "true" else None
            )
        else:
            if not isinstance(megatron_lm_plugin, MegatronLMPlugin):
                raise TypeError("`megatron_lm_plugin` must be a MegatronLMPlugin object.")
            os.environ["ACCELERATE_USE_MEGATRON_LM"] = "true"  # use MegatronLM if plugin is provided

        if megatron_lm_plugin:
            if not is_megatron_lm_available():
                raise ImportError("Megatron is not installed. please build it from source.")

        # Kwargs handlers
        self.ddp_handler = None
        self.scaler_handler = None
        self.init_handler = None
        self.fp8_recipe_handler = None
        self.ao_recipe_handler = None
        self.te_recipe_handler = None
        self.msamp_recipe_handler = None
        self.autocast_handler = None
        self.profile_handler = None
        self.has_lomo_optimizer = False

        found_handlers = set()
        handler_class_to_attr = {
            DistributedDataParallelKwargs: "ddp_handler",
            GradScalerKwargs: "scaler_handler",
            InitProcessGroupKwargs: "init_handler",
            FP8RecipeKwargs: "fp8_recipe_handler",
            AutocastKwargs: "autocast_handler",
            ProfileKwargs: "profile_handler",
            AORecipeKwargs: "ao_recipe_handler",
            TERecipeKwargs: "te_recipe_handler",
            MSAMPRecipeKwargs: "msamp_recipe_handler",
        }
        self.has_fp8_handler = False
        if kwargs_handlers is not None:
            for handler in kwargs_handlers:
                assert isinstance(handler, KwargsHandler), (
                    f"Unsupported kwargs handler passed: {handler}, must be one that inherits `accelerate.utils.KwargsHandler`."
                )
                # Add the handler class to the set of found handlers
                if handler.__class__ in found_handlers:
                    raise ValueError(f"You can only pass one {handler.__class__} in `kwargs_handlers`.")
                found_handlers.add(handler.__class__)
                handler_attr = handler_class_to_attr[handler.__class__]
                setattr(self, handler_attr, handler)
                if "recipe_handler" in handler_attr and not self.has_fp8_handler:
                    self.has_fp8_handler = True

        if parallelism_config is None:
            # TODO: Remove after deprecating tp_plugin
            if torch_tp_plugin is not None:
                parallelism_config = ParallelismConfig(tp_size=torch_tp_plugin.tp_size)
            elif os.environ.get("ACCELERATE_USE_PARALLELISM_CONFIG", "false").lower() == "true":
                parallelism_config = ParallelismConfig()

        kwargs = self.init_handler.to_kwargs() if self.init_handler is not None else {}
        self.state = AcceleratorState(
            mixed_precision=mixed_precision,
            cpu=cpu,
            dynamo_plugin=dynamo_plugin,
            deepspeed_plugin=deepspeed_plugins,
            fsdp_plugin=fsdp_plugin,
            megatron_lm_plugin=megatron_lm_plugin,
            parallelism_config=parallelism_config,
            _from_accelerator=True,
            **kwargs,
        )

        if self.parallelism_config:
            self.state.device_mesh = self.parallelism_config.get_device_mesh(self.device.type)
            self.parallelism_config._validate_accelerator(self)

        self.fp8_enabled = self.state.mixed_precision == "fp8" or mixed_precision == "fp8"
        # Check for automatic FP8 recipe creation
        if self.fp8_enabled and not self.has_fp8_handler:
            if self.fp8_backend == FP8BackendType.AO:
                self.ao_recipe_handler = AORecipeKwargs()
            elif self.fp8_backend == FP8BackendType.TE:
                self.te_recipe_handler = TERecipeKwargs()
            elif self.fp8_backend == FP8BackendType.MSAMP:
                self.msamp_recipe_handler = MSAMPRecipeKwargs()
            elif self.fp8_backend == FP8BackendType.NO:
                # Prioritize AO -> TE -> MSAMP
                if is_torchao_available():
                    logger.info("Found `torchao` installed, using it for FP8 training.")
                    self.ao_recipe_handler = AORecipeKwargs()
                elif is_transformer_engine_available():
                    logger.info("Found `transformer-engine` installed, using it for FP8 training.")
                    self.te_recipe_handler = TERecipeKwargs()
                elif is_msamp_available():
                    logger.info("Found `msamp` installed, using it for FP8 training.")
                    self.msamp_recipe_handler = MSAMPRecipeKwargs()
                else:
                    raise ImportError(
                        "Tried to train with `fp8` and auto-detect backend, but no FP8-compatible backend was installed. "
                        "Valid backends are: `torchao`, `transformer-engine`, and `msamp`."
                    )
            self.has_fp8_handler = True

        self.delayed_fp8_autocast = False
        if self.has_fp8_handler:
            # We already check if FP8 is available during `self.state`
            if not self.fp8_enabled and (
                self.distributed_type not in (DistributedType.FSDP, DistributedType.DEEPSPEED)
            ):
                raise ValueError("Passing in an FP8 configuration requires setting `mixed_precision='fp8'`.")
            self.delayed_fp8_autocast = self.fp8_backend == "TE" and self.distributed_type in (
                DistributedType.MULTI_GPU,
                DistributedType.FSDP,
            )

        # TODO: S1ro - this is probably gonna be a problem with other fp8 backends too
        if (
            self.fp8_backend == FP8BackendType.AO
            and self.state.distributed_type == DistributedType.FSDP
            and self.state.fsdp_plugin.cpu_ram_efficient_loading
        ):
            raise ValueError(
                "torchao with FSDP2 and cpu_ram_efficient_loading is not supported, setting `cpu_ram_efficient_loading` to False will fix the issue and work as intended."
            )

        trackers = filter_trackers(log_with, self.logging_dir)
        if len(trackers) < 1 and log_with is not None:
            warnings.warn(f"`log_with={log_with}` was passed but no supported trackers are currently installed.")
        self.log_with = trackers

        if (
            (mixed_precision != "bf16")
            and getattr(self.state, "downcast_bfloat", False)
            and (self.state.distributedType != DistributedType.XLA)
        ):
            raise ValueError("Can only use `downcast_bf16` when using `mixed_precision='bf16'` and on a TPU")

        if gradient_accumulation_plugin is not None:
            if gradient_accumulation_steps != 1:
                raise ValueError(
                    "You can only pass one of `gradient_accumulation_steps` and `gradient_accumulation_plugin`. Please only pass in the created `GradientAccumulationPlugin` object."
                )
        else:
            gradient_accumulation_steps = int(
                parse_choice_from_env("ACCELERATE_GRADIENT_ACCUMULATION_STEPS", gradient_accumulation_steps)
            )
            gradient_accumulation_plugin = GradientAccumulationPlugin(num_steps=gradient_accumulation_steps)

        # If using DeepSpeed, update gradient accumulation steps from the DeepSpeed plugin
        self.gradient_state = GradientState(
            gradient_accumulation_plugin=gradient_accumulation_plugin,
        )

        self.device_placement = device_placement
        if dataloader_config is None:
            dataloader_config = DataLoaderConfiguration()
        self.dataloader_config = dataloader_config
        self.step_scheduler_with_optimizer = step_scheduler_with_optimizer

        # Mixed precision attributes
        self.scaler = None
        self.native_amp = False
        if (
            self.state.mixed_precision == "fp16"
            and self.device.type != "cpu"
            and self.distributed_type not in (DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM)
        ):
            self.native_amp = True
            supported_device = ("xpu", "cuda", "npu", "xla", "mlu", "musa", "hpu", "sdaa", "mps")
            if self.device.type not in supported_device or is_torch_xla_available(check_is_tpu=True):
                raise ValueError(
                    f"fp16 mixed precision requires a device in {supported_device} (not {self.device.type!r})."
                )
            if self.device.type == "mps" and not is_torch_version(">=", "2.5.0"):
                raise ValueError("fp16 mixed precision with MPS device requires a Pytorch >= 2.5.0")
            kwargs = self.scaler_handler.to_kwargs() if self.scaler_handler is not None else {}

            # FSDP2 doesn't use ShardedGradScaler, don't want to modify `get_grad_scaler`, rather create a simple utility
            if self.is_fsdp2:
                self.scaler = get_fsdp2_grad_scaler(device=self.device.type, **kwargs)
            else:
                self.scaler = get_grad_scaler(self.distributed_type, **kwargs)

        elif self.state.mixed_precision == "bf16" and self.distributed_type not in (
            DistributedType.DEEPSPEED,
            DistributedType.MEGATRON_LM,
        ):
            if self.device.type in ["cpu", "xpu", "hpu"]:
                self.native_amp = True
            else:
                self.native_amp = is_bf16_available(True)
            if not self.native_amp and not is_torch_xla_available():
                raise ValueError("bf16 mixed precision requires PyTorch >= 1.10 and a supported device.")
            if self.native_amp and self.device.type == "mps" and not is_torch_version(">=", "2.6.0"):
                raise ValueError("bf16 mixed precision with MPS device requires a Pytorch >= 2.6.0")

        # for DeepSpeed,  self.state.mixed_precision is always "bf16",
        # see https://github.com/huggingface/accelerate/blob/main/src/accelerate/state.py#L968 and
        # https://github.com/huggingface/accelerate/blob/main/src/accelerate/utils/dataclasses.py#L1263.
        elif self.fp8_enabled:
            # We always enable `native_amp` for FP8
            self.native_amp = True
            if self.fp8_backend == FP8BackendType.MSAMP:
                if self.distributed_type == DistributedType.FSDP:
                    raise NotImplementedError(
                        "`accelerate` + `MS-AMP` + `FSDP` is not supported at this time. "
                        "Please consider using deepspeed, which is supported."
                    )
                elif self.distributed_type != DistributedType.DEEPSPEED:
                    # MS-AMP requires `GradScaler` even with bf16 autocast w/ single GPU or DDP:
                    self.scaler = get_grad_scaler(**kwargs)

        # Start of internal step tracking
        self.step = 0

        # Internal references to the training objects
        self._optimizers = []
        self._models = []
        self._schedulers = []
        self._dataloaders = []
        self._custom_objects = []

        # Hooks
        self._load_model_state_pre_hook = OrderedDict()
        self._save_model_state_pre_hook = OrderedDict()

        # RNG Types
        self.rng_types = rng_types
        if self.rng_types is None:
            self.rng_types = ["generator"]

        # Set a flag tensor for early stopping and other breakpoints
        self.flag_tensor = None

        check_os_kernel()

    @property
    def deepspeed_plugin(self):
        """
        The DeepSpeedPlugin that is currently active, as stored on the shared state.

        When several plugins were registered, the first one is active by default. Call
        `accelerator.state.select_deepspeed_plugin(key)` to switch to another one.

        `None` is returned when deepspeed is not enabled.
        """
        shared_state = self.state
        return shared_state.deepspeed_plugin

    @property
    def use_distributed(self):
        """
        Whether the Accelerator is configured for distributed training (forwarded from the shared state).
        """
        shared_state = self.state
        return shared_state.use_distributed

    @property
    def multi_device(self):
        """
        Whether distributed mode is on and the distributed type is one of the `MULTI_*` device backends.
        """
        distributed = self.use_distributed
        if not distributed:
            # Mirror `and` short-circuiting: hand back the falsy flag itself
            return distributed
        multi_backends = (
            DistributedType.MULTI_GPU,
            DistributedType.MULTI_MLU,
            DistributedType.MULTI_SDAA,
            DistributedType.MULTI_MUSA,
            DistributedType.MULTI_NPU,
            DistributedType.MULTI_XPU,
            DistributedType.MULTI_HPU,
            DistributedType.MULTI_NEURON,
        )
        return self.distributed_type in multi_backends

    @property
    def distributed_type(self):
        """Return `distributed_type` from the shared state."""
        shared_state = self.state
        return shared_state.distributed_type

    @property
    def num_processes(self):
        """Return `num_processes` from the shared state."""
        shared_state = self.state
        return shared_state.num_processes

    @property
    def process_index(self):
        """Return `process_index` from the shared state."""
        shared_state = self.state
        return shared_state.process_index

    @property
    def local_process_index(self):
        """Return `local_process_index` from the shared state."""
        shared_state = self.state
        return shared_state.local_process_index

    @property
    def device(self):
        """Return `device` from the shared state."""
        shared_state = self.state
        return shared_state.device

    @property
    def split_batches(self):
        """Return `split_batches` from the dataloader configuration."""
        config = self.dataloader_config
        return config.split_batches

    @property
    def dispatch_batches(self):
        """Return `dispatch_batches` from the dataloader configuration."""
        config = self.dataloader_config
        return config.dispatch_batches

    @property
    def even_batches(self):
        """Return `even_batches` from the dataloader configuration."""
        config = self.dataloader_config
        return config.even_batches

    @even_batches.setter
    def even_batches(self, value: bool):
        """Store `value` as `even_batches` on the dataloader configuration."""
        config = self.dataloader_config
        config.even_batches = value

    @property
    def use_seedable_sampler(self):
        """Return `use_seedable_sampler` from the dataloader configuration."""
        config = self.dataloader_config
        return config.use_seedable_sampler

    @property
    def non_blocking(self):
        """Return `non_blocking` from the dataloader configuration."""
        config = self.dataloader_config
        return config.non_blocking

    @property
    def use_stateful_dataloader(self):
        """
        Return `use_stateful_dataloader` from the dataloader configuration, or `False` when the configuration
        object does not define that attribute.
        """
        return getattr(self.dataloader_config, "use_stateful_dataloader", False)

    @property
    def project_dir(self):
        """Return `project_dir` from the project configuration."""
        project = self.project_configuration
        return project.project_dir

    @property
    def logging_dir(self):
        """Return `logging_dir` from the project configuration."""
        project = self.project_configuration
        return project.logging_dir

    @property
    def save_iteration(self):
        """Return `iteration` from the project configuration."""
        project = self.project_configuration
        return project.iteration

    @property
    def is_main_process(self):
        """True for one process only (forwarded from the shared state)."""
        shared_state = self.state
        return shared_state.is_main_process

    @property
    def is_local_main_process(self):
        """True for one process per server (forwarded from the shared state)."""
        shared_state = self.state
        return shared_state.is_local_main_process

    @property
    def is_last_process(self):
        """Whether this process's index equals `num_processes - 1`."""
        current_index = self.process_index
        final_index = self.num_processes - 1
        return current_index == final_index

    @property
    def mixed_precision(self):
        """Return `mixed_precision` from the shared state."""
        shared_state = self.state
        return shared_state.mixed_precision

    @property
    def is_fsdp2(self):
        """Return `is_fsdp2` from the shared state."""
        shared_state = self.state
        return shared_state.is_fsdp2

    @property
    def is_composable_parallelism_enabled(self):
        """Currently an alias of `is_fsdp2`."""
        fsdp2_active = self.is_fsdp2
        return fsdp2_active

    @property
    def parallelism_config(self) -> Union[ParallelismConfig, None]:
        """Return `parallelism_config` from the shared state (may be `None`)."""
        shared_state = self.state
        return shared_state.parallelism_config

    @property
    def torch_device_mesh(self):
        """Return `device_mesh` from the shared state."""
        shared_state = self.state
        return shared_state.device_mesh

    @property
    def should_save_model(self):
        """
        Whether this process should save the model.

        With a parallelism config present, every process currently reports `True` (temporary behaviour, see the
        TODO below). Without one, the answer is the state's `is_local_main_process`.
        """
        config = self.parallelism_config
        if config is not None:
            # Built for the rank-based check sketched below; not consulted yet.
            _non_model_shard_dims = {
                config.dp_replicate_enabled: "dp_replicate",
                config.cp_enabled: "cp",
            }

            # return all(
            #     self.torch_device_mesh[dim].get_local_rank() == 0 for key, dim in non_model_shard_dims.items() if key
            # )
            # TODO: S1ro - this is a temporary solution until we figure out why `save_safe_file` is slow when not all processes
            return True

        # shouldn't even happen
        return self.state.is_local_main_process

    @property
    def tensor_parallel_rank(self) -> int:
        """
        Local rank along the `"tp"` dimension of the device mesh.

        When a parallelism config exists but tensor parallelism is not enabled, 0 is returned since all ranks are
        assumed to be the same.

        Raises:
            RuntimeError: If no `parallelism_config` is set.
        """
        config = self.parallelism_config
        if not config:
            raise RuntimeError("Tensor parallelism is not configured. Set `parallelism_config` first.")
        if not config.tp_enabled:
            return 0
        return self.torch_device_mesh.get_local_rank("tp")

    @property
    def pipeline_parallel_rank(self) -> int:
        """
        Placeholder: pipeline parallelism is not supported yet, so accessing this always raises
        `NotImplementedError`.
        """
        reason = "Pipeline parallelism is currently not supported in Accelerate."
        raise NotImplementedError(reason)

    @property
    def context_parallel_rank(self) -> int:
        """
        Placeholder: context parallelism is not supported yet, so accessing this always raises
        `NotImplementedError`.
        """
        reason = "Context parallelism is currently not supported in Accelerate."
        raise NotImplementedError(reason)

    @property
    def data_parallel_rank(self) -> int:
        """
        Local rank along the `"dp_replicate"` dimension of the device mesh.

        When a parallelism config exists but replicate-based data parallelism is not enabled, 0 is returned since
        all ranks are assumed to be the same.

        Raises:
            RuntimeError: If no `parallelism_config` is set.
        """
        config = self.parallelism_config
        if not config:
            raise RuntimeError("Data parallelism is not configured. Set `parallelism_config` first.")
        if not config.dp_replicate_enabled:
            return 0
        return self.torch_device_mesh.get_local_rank("dp_replicate")

    @property
    def data_parallel_shard_rank(self) -> int:
        """
        Local rank along the `"dp_shard"` dimension of the device mesh.

        When a parallelism config exists but shard-based data parallelism is not enabled, 0 is returned since all
        ranks are assumed to be the same.

        Raises:
            RuntimeError: If no `parallelism_config` is set.
        """
        config = self.parallelism_config
        if not config:
            raise RuntimeError("Shard-based data parallelism is not configured. Set `parallelism_config` first.")
        if not config.dp_shard_enabled:
            return 0
        return self.torch_device_mesh.get_local_rank("dp_shard")

    @contextmanager
    def split_between_processes(self, inputs: list | tuple | dict | torch.Tensor, apply_padding: bool = False):
        """
        Context manager that hands each process its own slice of `inputs`. Handy for distributed inference, e.g.
        when every process should work on different prompts. The work is delegated to
        `PartialState.split_between_processes`.

        When `inputs` is a `dict`, every key must hold the same number of elements.

        Args:
            inputs (`list`, `tuple`, `torch.Tensor`, or `dict` of `list`/`tuple`/`torch.Tensor`):
                The input to split between processes.
            apply_padding (`bool`, `optional`, defaults to `False`):
                Whether to pad by repeating the last element of the input so that every process receives the same
                number of elements. Useful before calling `Accelerator.gather()` on the outputs, or when fewer
                inputs than processes are passed. Remember to drop the padded elements afterwards.

        Example:

        ```python
        # Assume there are two processes
        from accelerate import Accelerator

        accelerator = Accelerator()
        with accelerator.split_between_processes(["A", "B", "C"]) as inputs:
            print(inputs)
        # Process 0
        ["A", "B"]
        # Process 1
        ["C"]

        with accelerator.split_between_processes(["A", "B", "C"], apply_padding=True) as inputs:
            print(inputs)
        # Process 0
        ["A", "B"]
        # Process 1
        ["C", "C"]
        ```
        """
        splitter = PartialState().split_between_processes(inputs, apply_padding=apply_padding)
        with splitter as process_inputs:
            yield process_inputs

    def on_main_process(self, function: Callable[..., Any] | None = None):
        """
        A decorator that will run the decorated function on the main process only. Can also be called using the
        `PartialState` class.

        Args:
            function (`Callable`): The function to decorate.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()


        >>> @accelerator.on_main_process
        ... def print_something():
        ...     print("This will be printed by process 0 only.")


        >>> print_something()
        "This will be printed by process 0 only"
        ```
        """
        # For times when the `Accelerator` object itself utilizes this decorator.
        if function is None:
            if "Accelerator." in self.__qualname__:
                function = self
            else:
                raise ValueError(
                    "The `on_main_process` decorator must be called with a function on an instantiated `Accelerator` object."
                )

        def _inner(*args, **kwargs):
            return PartialState().on_main_process(function)(*args, **kwargs)

        return _inner

    def on_local_main_process(self, function: Callable[..., Any] | None = None):
        """
        A decorator that will run the decorated function on the local main process only. Can also be called using the
        `PartialState` class.

        Args:
            function (`Callable`): The function to decorate.

        Example:
        ```python
        # Assume we have 2 servers with 4 processes each.
        from accelerate import Accelerator

        accelerator = Accelerator()


        @accelerator.on_local_main_process
        def print_something():
            print("This will be printed by process 0 only on each server.")


        print_something()
        # On server 1:
        "This will be printed by process 0 only"
        # On server 2:
        "This will be printed by process 0 only"
        ```
        """
        # For times when the `Accelerator` object itself utilizes this decorator.
        if function is None:
            if "Accelerator." in self.__qualname__:
                function = self
            else:
                raise ValueError(
                    "The `on_local_main_process` decorator must be called with a function on an instantiated `Accelerator` object."
                )

        def _inner(*args, **kwargs):
            return PartialState().on_local_main_process(function)(*args, **kwargs)

        return _inner

    def on_last_process(self, function: Callable[..., Any]):
        """
        A decorator that will run the decorated function on the last process only. Can also be called using the
        `PartialState` class.

        Args:
            function (`Callable`): The function to decorate.

        Example:
        ```python
        # Assume we have 4 processes.
        from accelerate import Accelerator

        accelerator = Accelerator()


        @accelerator.on_last_process
        def print_something():
            print(f"Printed on process {accelerator.process_index}")


        print_something()
        "Printed on process 3"
        ```
        """
        # For times when the `Accelerator` object itself utilizes this decorator.
        if function is None:
            if "Accelerator." in self.__qualname__:
                function = self
            else:
                raise ValueError(
                    "The `on_last_process` decorator must be called with a function on an instantiated `Accelerator` object."
                )

        def _inner(*args, **kwargs):
            return PartialState().on_last_process(function)(*args, **kwargs)

        return _inner

    def on_process(self, function: Callable[..., Any] | None = None, process_index: int | None = None):
        """
        A decorator that will run the decorated function on a given process index only. Can also be called using the
        `PartialState` class.

        Args:
            function (`Callable`, `optional`):
                The function to decorate.
            process_index (`int`, `optional`):
                The index of the process on which to run the function.

        Example:
        ```python
        # Assume we have 4 processes.
        from accelerate import Accelerator

        accelerator = Accelerator()


        @accelerator.on_process(process_index=2)
        def print_something():
            print(f"Printed on process {accelerator.process_index}")


        print_something()
        "Printed on process 2"
        ```
        """
        # Initial construction of the decorator.
        if (self is not None) and (process_index is not None) and (function is None):
            return partial(self.on_process, process_index=process_index)
        # For times when the `Accelerator` object itself utilizes this decorator.
        if function is None:
            if "Accelerator." in self.__qualname__:
                function = self
            else:
                raise ValueError(
                    "The `on_main_process` decorator must be called with a function on an instantiated `Accelerator` object."
                )

        def _inner(*args, **kwargs):
            return PartialState().on_process(function, process_index)(*args, **kwargs)

        return _inner

    def on_local_process(self, function: Callable[..., Any] | None = None, local_process_index: int | None = None):
        """
        A decorator that will run the decorated function on a given local process index only. Can also be called using
        the `PartialState` class.

        Args:
            function (`Callable`, *optional*):
                The function to decorate.
            local_process_index (`int`, *optional*):
                The index of the local process on which to run the function.

        Example:
        ```python
        # Assume we have 2 servers with 4 processes each.
        from accelerate import Accelerator

        accelerator = Accelerator()


        @accelerator.on_local_process(local_process_index=2)
        def print_something():
            print(f"Printed on process {accelerator.local_process_index}")


        print_something()
        # On server 1:
        "Printed on process 2"
        # On server 2:
        "Printed on process 2"
        ```
        """
        # Initial construction of the decorator.
        if (self is not None) and (local_process_index is not None) and (function is None):
            return partial(self.on_local_process, local_process_index=local_process_index)
        # For times when the `Accelerator` object itself utilizes this decorator.
        if function is None:
            if "Accelerator." in self.__qualname__:
                function = self
            else:
                raise ValueError(
                    "The `on_main_process` decorator must be called with a function on an instantiated `Accelerator` object."
                )

        def _inner(*args, **kwargs):
            return PartialState().on_local_process(function, local_process_index)(*args, **kwargs)

        return _inner

    @contextmanager
    def main_process_first(self):
        """
        Context manager that lets the main process run the body of the `with` block first, by delegating to the
        state's `main_process_first`.

        The other processes will enter the with block after the main process exits.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> with accelerator.main_process_first():
        ...     # This will be printed first by process 0 then in a seemingly
        ...     # random order by the other processes.
        ...     print(f"This will be printed by process {accelerator.process_index}")
        ```
        """
        ordering = self.state.main_process_first()
        with ordering:
            yield

    @contextmanager
    def local_main_process_first(self):
        """
        Context manager that lets the local main process run the body of the `with` block first, by delegating to
        the state's `local_main_process_first`.

        The other processes will enter the with block after the main process exits.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> with accelerator.local_main_process_first():
        ...     # This will be printed first by local process 0 then in a seemingly
        ...     # random order by the other processes.
        ...     print(f"This will be printed by process {accelerator.local_process_index}")
        ```
        """
        ordering = self.state.local_main_process_first()
        with ordering:
            yield

    @contextmanager
    def no_sync(self, model):
        """
        A context manager to disable gradient synchronizations across DDP processes by calling
        `torch.nn.parallel.DistributedDataParallel.no_sync`.

        With FSDP2, `model.set_requires_gradient_sync` is switched off for the duration of the block and switched
        back on afterwards. If `model` has no `no_sync` method (e.g. it is not in DDP), this context manager does
        nothing.

        Args:
            model (`torch.nn.Module`):
                PyTorch Module that was prepared with `Accelerator.prepare`

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> dataloader, model, optimizer = accelerator.prepare(dataloader, model, optimizer)
        >>> input_a = next(iter(dataloader))
        >>> input_b = next(iter(dataloader))

        >>> with accelerator.no_sync(model):
        ...     outputs = model(input_a)
        ...     loss = loss_func(outputs)
        ...     accelerator.backward(loss)
        ...     # No synchronization across processes, only accumulate gradients
        >>> outputs = model(input_b)
        >>> loss = loss_func(outputs)
        >>> accelerator.backward(loss)
        >>> # Synchronization across all processes
        >>> optimizer.step()
        >>> optimizer.zero_grad()
        ```
        """
        if self.is_fsdp2:
            model.set_requires_gradient_sync(False)
            try:
                yield
            finally:
                # Always re-enable syncing, even if the body raised
                model.set_requires_gradient_sync(True)
            return

        sync_guard = contextlib.nullcontext
        # DeepSpeed only gets the model's `no_sync` below ZeRO stage 2
        if self.use_distributed and (
            self.distributed_type != DistributedType.DEEPSPEED or self.state.deepspeed_plugin.zero_stage < 2
        ):
            sync_guard = getattr(model, "no_sync", sync_guard)

        with sync_guard():
            yield

    @staticmethod
    @contextmanager
    def trigger_sync_in_backward(model):
        """
        Trigger the sync of the gradients in the next backward pass of the model after multiple forward passes
        under `Accelerator.no_sync` (only applicable in multi-GPU scenarios).

        If `model` is not a `torch.nn.parallel.DistributedDataParallel` instance, this context manager does
        nothing.

        Args:
            model (`torch.nn.Module`):
                The model for which to trigger the gradient synchronization.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> dataloader, model, optimizer = accelerator.prepare(dataloader, model, optimizer)

        >>> with accelerator.no_sync(model):
        ...     loss_a = loss_func(model(input_a))  # first forward pass
        ...     loss_b = loss_func(model(input_b))  # second forward pass
        >>> accelerator.backward(loss_a)  # No synchronization across processes, only accumulate gradients
        >>> with accelerator.trigger_sync_in_backward(model):
        ...     accelerator.backward(loss_b)  # Synchronization across all processes
        >>> optimizer.step()
        >>> optimizer.zero_grad()
        ```
        """
        if isinstance(model, torch.nn.parallel.DistributedDataParallel):
            # Remember the current flags so they can be put back on exit
            saved_backward_sync = model.require_backward_grad_sync
            saved_forward_sync = model.require_forward_param_sync

            # EXPERIMENTAL: This will force grad sync during `backward()`, but it is unknown if it breaks other DDP features.
            # https://github.com/pytorch/pytorch/blob/e1502c0cdbfd17548c612f25d5a65b1e4b86224d/torch/nn/parallel/distributed.py#L1453-L1466
            model.require_backward_grad_sync = True
            model.require_forward_param_sync = True
            # https://github.com/pytorch/pytorch/blob/e1502c0cdbfd17548c612f25d5a65b1e4b86224d/torch/csrc/distributed/c10d/reducer.cpp#L1371-L1402
            model.reducer.prepare_for_backward([])
            try:
                yield
            finally:
                model.require_backward_grad_sync = saved_backward_sync
                model.require_forward_param_sync = saved_forward_sync
        else:
            yield

    def _do_sync(self):
        "Sets the right `sync_gradients` context and either resets or increases `self.step`"
        if self.gradient_state.sync_with_dataloader and self.gradient_state.end_of_dataloader:
            self.step = 0
            self.gradient_state._set_sync_gradients(True)
        else:
            self.step += 1
            self.gradient_state._set_sync_gradients((self.step % self.gradient_state.num_steps) == 0)

    @property
    def sync_gradients(self):
        """The `sync_gradients` value currently stored on `self.gradient_state`."""
        grad_state = self.gradient_state
        return grad_state.sync_gradients

    @sync_gradients.setter
    def sync_gradients(self, sync_gradients):
        # The value lives on the gradient state; the accelerator only forwards it.
        grad_state = self.gradient_state
        grad_state.sync_gradients = sync_gradients

    @property
    def gradient_accumulation_steps(self):
        """The number of gradient accumulation steps, read from `self.gradient_state.num_steps`."""
        grad_state = self.gradient_state
        return grad_state.num_steps

    @gradient_accumulation_steps.setter
    def gradient_accumulation_steps(self, gradient_accumulation_steps):
        # The new value is written to the `num_steps` entry of the gradient state's plugin kwargs.
        plugin_kwargs = self.gradient_state.plugin_kwargs
        plugin_kwargs.update(num_steps=gradient_accumulation_steps)

    @contextmanager
    def accumulate(self, *models):
        """
        A context manager that lightly wraps a training step and takes care of gradient accumulation automatically.

        Args:
            *models (list of `torch.nn.Module`):
                PyTorch Modules that were prepared with `Accelerator.prepare`. Models passed to `accumulate()` will
                skip gradient syncing during backward pass in distributed training

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator(gradient_accumulation_steps=1)
        >>> dataloader, model, optimizer, scheduler = accelerator.prepare(dataloader, model, optimizer, scheduler)

        >>> for input, output in dataloader:
        ...     with accelerator.accumulate(model):
        ...         outputs = model(input)
        ...         loss = loss_func(outputs)
        ...         loss.backward()
        ...         optimizer.step()
        ...         scheduler.step()
        ...         optimizer.zero_grad()
        ```
        """
        self._do_sync()

        if self.sync_gradients:
            # Gradients must be reduced on this step so that the optimizer step can complete.
            allow_gradient_sync = True
        else:
            # `no_sync` stops gradients from being reduced during distributed training, which brings a speedup
            # (potentially at some costs). Setting `sync_each_batch = True` opts out of it, which is only relevant
            # in distributed settings.
            allow_gradient_sync = self.use_distributed and self.gradient_state.plugin_kwargs.get(
                "sync_each_batch", False
            )

        with contextlib.ExitStack() as stack:
            for model in models:
                if allow_gradient_sync:
                    model_context = contextlib.nullcontext()
                else:
                    model_context = self.no_sync(model)
                stack.enter_context(model_context)
            yield

    @contextmanager
    def join_uneven_inputs(self, joinables, even_batches=None):
        """
        A context manager for distributed training or evaluation on uneven inputs. It wraps
        `torch.distributed.algorithms.join`, which is useful when the total batch size does not evenly divide the
        length of the dataset.

        Args:
            joinables (`list[torch.distributed.algorithms.Joinable]`):
                A list of models or optimizers that subclass `torch.distributed.algorithms.Joinable`. Most commonly, a
                PyTorch Module that was prepared with `Accelerator.prepare` for DistributedDataParallel training.
            even_batches (`bool`, *optional*)
                If set, this will override the value of `even_batches` set in the `Accelerator` for the duration of
                the context. If it is not provided, the default `Accelerator` value will be used.

        <Tip warning={true}>

        `join_uneven_inputs` is only supported for Distributed Data Parallel training on multiple GPUs. For any other
        configuration, this method will have no effect.

        </Tip>

        <Tip warning={true}>

        Overriding `even_batches` will not affect iterable-style data loaders.

        </Tip>

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator(even_batches=True)
        >>> ddp_model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

        >>> with accelerator.join_uneven_inputs([ddp_model], even_batches=False):
        ...     for input, output in dataloader:
        ...         outputs = ddp_model(input)
        ...         loss = loss_func(outputs)
        ...         loss.backward()
        ...         optimizer.step()
        ...         optimizer.zero_grad()
        ```
        """
        if not self.multi_device:
            # Even when disabled, Join expects models to subclass Joinable, so skip entirely for single process runs
            if self.distributed_type != DistributedType.NO:
                warnings.warn(
                    "Joining uneven inputs is only supported for multi-GPU training, as a result `join_uneven_inputs` will have no effect."
                )

            with contextlib.nullcontext(joinables):
                yield
            return

        # (dataloader index, previous `even_batches` value) for every batch sampler that gets overridden below
        overridden_samplers = []

        if even_batches is None:
            even_batches = self.even_batches
        else:
            skipped_iterable_dl = False
            # override value in batch sampler for map-style datasets
            for dl_idx, dl in enumerate(self._dataloaders):
                if isinstance(dl, DataLoaderDispatcher):
                    skipped_iterable_dl = True
                    continue
                overridden_samplers.append((dl_idx, dl.batch_sampler.even_batches))
                dl.batch_sampler.even_batches = even_batches

            if skipped_iterable_dl:
                warnings.warn(
                    "Overriding even_batches is only supported for map-style datasets, yet some dataloaders given were iterable"
                )

        # Joining is only needed when batches are allowed to be uneven across processes.
        try:
            with Join(joinables, enable=not even_batches, throw_on_early_termination=False):
                yield
        finally:
            # reset any batch samplers that have been modified
            for dl_idx, previous_value in overridden_samplers:
                self._dataloaders[dl_idx].batch_sampler.even_batches = previous_value

    def print(self, *args, **kwargs):
        """
        Drop-in replacement for the built-in `print()`. Every argument is forwarded to `self.state.print`, which is
        expected to only print once per server.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> accelerator.print("Hello world!")
        ```
        """
        state_print = self.state.print
        state_print(*args, **kwargs)

    def _prepare_one(self, obj, first_pass=False, device_placement=None):
        # First pass of preparation: DataLoader, model, optimizer
        if first_pass:
            if isinstance(obj, torch.utils.data.DataLoader):
                return self.prepare_data_loader(obj, device_placement=device_placement)
            elif isinstance(obj, torch.nn.Module):
                return self.prepare_model(obj, device_placement=device_placement)
            elif isinstance(obj, torch.optim.Optimizer):
                optimizer = self.prepare_optimizer(obj, device_placement=device_placement)
                return optimizer
        # Second pass of preparation: LR scheduler (which need the full list of optimizers)
        elif isinstance(obj, LRScheduler):
            scheduler = self.prepare_scheduler(obj)
            return scheduler
        # Return the unprocessed object if previous criteria was not met
        return obj

    def prepare(self, *args, device_placement=None):
        """
        Prepare every object passed in `args` for distributed training and mixed precision, and hand them back in the
        order they were given.

        Args:
            *args (list of objects):
                Any of the following type of objects:

                - `torch.utils.data.DataLoader`: PyTorch Dataloader
                - `torch.nn.Module`: PyTorch Module
                - `torch.optim.Optimizer`: PyTorch Optimizer
                - `torch.optim.lr_scheduler.LRScheduler`: PyTorch LR Scheduler

            device_placement (`list[bool]`, *optional*):
                Used to customize whether automatic device placement should be performed for each object passed. Needs
                to be a list of the same length as `args`. Passing it raises a `ValueError` with DeepSpeed or
                Megatron-LM.

        Returns:
            The prepared objects, in the same order as `args`. When exactly one object is prepared it is returned on
            its own instead of inside a sequence.

        <Tip>

          You don't need to prepare a model if you only use it for inference without any kind of mixed precision

        </Tip>

        Examples:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> # Assume a model, optimizer, data_loader and scheduler are defined
        >>> model, optimizer, data_loader, scheduler = accelerator.prepare(model, optimizer, data_loader, scheduler)
        ```

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> # Assume a model, optimizer, data_loader and scheduler are defined
        >>> device_placement = [True, True, False, False]
        >>> # Will place the first two items passed in automatically to the right device but not the last two.
        >>> model, optimizer, data_loader, scheduler = accelerator.prepare(
        ...     model, optimizer, data_loader, scheduler, device_placement=device_placement
        ... )
        ```
        """

        def _flag_prepared(items):
            # Mark every item that ended up in one of the accelerator's tracking lists as prepared.
            for item in items:
                if any(
                    item in container
                    for container in (self._dataloaders, self._models, self._optimizers, self._schedulers)
                ):
                    item._is_accelerate_prepared = True

        if device_placement is None:
            device_placement = [None] * len(args)
        else:
            if self.distributed_type in (DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM):
                raise ValueError("You can't customize device placements with DeepSpeed or Megatron-LM.")
            if len(device_placement) != len(args):
                raise ValueError(
                    f"`device_placement` should be a list with {len(args)} elements (the number of objects passed)."
                )

        for obj in args:
            # TODO: Look at enabling native TP training directly with a proper config
            if not (isinstance(obj, torch.nn.Module) and self.verify_device_map(obj)):
                continue
            if self.distributed_type == DistributedType.NO:
                continue
            if os.environ.get("ACCELERATE_BYPASS_DEVICE_MAP", "false") == "true":
                continue
            raise ValueError(
                "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
                " Please rerun your script specifying `--num_processes=1` or by launching with `python {{myscript.py}}`."
            )

        if self.distributed_type == DistributedType.DEEPSPEED:
            num_models = sum(1 for obj in args if isinstance(obj, torch.nn.Module))
            if num_models > 1:
                raise AssertionError(
                    "You can't use same `Accelerator()` instance with multiple models when using DeepSpeed"
                )

        # On TPUs, putting the model on the XLA device will create new parameters, so the corresponding optimizer will
        # have parameters disconnected from the model (so no training :-( ).
        # If the model and optimizer have parameters on different devices we raise an error.
        if self.distributed_type == DistributedType.XLA:
            model_device, optimizer_device = self._get_devices()
            devices_known = model_device is not None and optimizer_device is not None
            if devices_known and model_device != optimizer_device:
                raise ValueError(
                    "The model and the optimizer parameters are not on the same device, which probably means you "
                    "created an optimizer around your model **before** putting on the device. Make sure the line "
                    "model.to(device) is before the optimizer creation in your script or remove it entirely and use "
                    "the flag default value for `device_placement` in your `Accelerator` to let it handle that "
                    "part for you."
                )

        if self.is_fsdp2:
            num_models = 0
            num_optimizers = 0
            for obj in args:
                if isinstance(obj, torch.nn.Module):
                    num_models += 1
                elif isinstance(obj, torch.optim.Optimizer):
                    num_optimizers += 1

            # Only the presence of models/optimizers is compared, so that passing objects other than
            # models/optimizers doesn't raise an error
            if (num_models > 0) != (num_optimizers > 0):
                raise ValueError(
                    "When using FSDP2, a model and optimizer must be passed together to `Accelerator.prepare()`"
                    " as the optimizer needs to have its parameters modified after the model is converted."
                )
            if num_models > 1:
                raise ValueError("Only one model is supported when using FSDP2")

        # If we're dealing with device placement, this deals with that by...
        fix_xla_optimizer = self.device_placement and self.distributed_type == DistributedType.XLA

        if fix_xla_optimizer:
            # 1. grabbing old model parameters
            old_named_params = self._get_named_parameters(*args, drop_refs=False)

        if self.parallelism_config and self.parallelism_config.tp_enabled:
            args = self._prepare_tp(*args)
            _flag_prepared(args)

        if self.parallelism_config and self.parallelism_config.cp_enabled:
            args = self._prepare_cp(*args)

        # for megatron-lm, we don't need to prepare TE AO at this moment
        if self.distributed_type != DistributedType.MEGATRON_LM:
            if self.fp8_backend == FP8BackendType.TE:
                args = self._prepare_te(*args)
            elif self.fp8_backend == FP8BackendType.AO:
                args = self._prepare_ao(*args)

        if self.distributed_type == DistributedType.DEEPSPEED:
            result = self._prepare_deepspeed(*args)
        elif self.distributed_type == DistributedType.MEGATRON_LM:
            result = self._prepare_megatron_lm(*args)
        elif self.is_fsdp2:
            result = self._prepare_fsdp2(*args)
        else:
            if self.fp8_backend == FP8BackendType.MSAMP:
                args, device_placement = self._prepare_msamp(*args, device_placement=device_placement)
            # First pass handles dataloaders, models and optimizers; the second pass handles schedulers.
            after_first_pass = tuple(
                self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
            )
            result = tuple(
                self._prepare_one(obj, device_placement=d) for obj, d in zip(after_first_pass, device_placement)
            )

        if fix_xla_optimizer:
            # 2. grabbing new model parameters
            new_named_params = self._get_named_parameters(*result)
            # 3. building a map from the first to the second
            mapping = {p: new_named_params[n] for n, p in old_named_params.items()}
            # 4. using that map to update the parameters of the optimizer
            for obj in result:
                if isinstance(obj, torch.optim.Optimizer):
                    obj._switch_parameters(mapping)

        _flag_prepared(result)

        if len(result) > 1:
            return result
        return result[0]

    def _prepare_tp(self, *args):
        # First pass: prepare everything except schedulers (first_pass=True) and the model, which is prepared separately
        # below
        result = [
            self._prepare_one(obj, first_pass=True) if not isinstance(obj, torch.nn.Module) else obj for obj in args
        ]

        # Second pass: prepare schedulers
        result = [self._prepare_one(obj) if not isinstance(obj, torch.nn.Module) else obj for obj in result]

        for arg in args:
            if not isinstance(arg, torch.nn.Module):
                continue
            model = arg

            from torch.distributed.tensor import DTensor

            if not any(isinstance(p, DTensor) for p in model.parameters()):
                logger.warning(
                    "The model parameters are not sharded by DTensor, we skip the TP preparation. If you are using "
                    "a PreTrained model it is expected and this warning can be ignored."
                )
                return result

        # Now we prepare the model
        device_mesh = self.torch_device_mesh

        old_named_params = self._get_named_parameters(*tuple(result), drop_refs=True)

        from torch.distributed.tensor import DTensor

        if self.is_fsdp2:
            for arg in result:
                if not isinstance(arg, torch.nn.Module):
                    continue

                from torch.distributed.tensor import Replicate
                from transformers.integrations.tensor_parallel import ReplicateParallel

                model: torch.nn.Module = arg
                tp_plan = ReplicateParallel

                for name, param in model.named_parameters():
                    if isinstance(param, DTensor):
                        continue

                    dp = DTensor.from_local(param, device_mesh=device_mesh["tp"], placements=[Replicate()])
                    param_name, param_type = name.rsplit(".", 1)
                    module_to_tp = model.get_submodule(param_name)

                    tp_plan().prepare_module_tp(module_to_tp, device_mesh["tp"])
                    if not isinstance(dp, torch.nn.Parameter):
                        dp = torch.nn.Parameter(dp, requires_grad=param.requires_grad)
                    setattr(module_to_tp, param_type, dp)

        new_named_params = self._get_named_parameters(*tuple(result), drop_refs=False)
        # Build a map from old to new params
        mapping = {p: new_named_params[n] for n, p in old_named_params.items()}

        if not mapping:
            return result

        def _get_tensor_address(p):
            if isinstance(p, DTensor):
                return p._local_tensor.data_ptr()
            return p.data_ptr()

        for obj in result:
            if isinstance(obj, torch.optim.Optimizer):
                for param_group in obj.param_groups:
                    # Each param_group originally maps to model parameters (e.g., from model.parameters()).
                    # After _prepare_tp(), parameter references are replaced with DTensor instances.
                    # Therefore, we remap the parameter references to their new DTensor addresses
                    # so that the optimizer can correctly update the model parameters.
                    param_group["params"] = [mapping[_get_tensor_address(p)] for p in param_group["params"]]

        return result

    def _prepare_cp(self, *args):
        """
        Set up context parallelism and return `args` unchanged.

        The rotate method is set from the context parallel handler's `cp_comm_strategy`, a `context_parallel` partial
        bound to the `"cp"` sub-mesh is stored on `self._cp_context`, and context parallel hooks are attached to every
        `torch.nn.Module` in `args`.
        """
        from torch.distributed.tensor.experimental import context_parallel
        from torch.distributed.tensor.experimental._attention import set_rotate_method

        set_rotate_method(self.parallelism_config.cp_handler.cp_comm_strategy)

        cp_mesh = self.torch_device_mesh["cp"]
        self._cp_context = functools.partial(context_parallel, mesh=cp_mesh)

        # Only models get hooks; every other object is passed through untouched.
        modules = [obj for obj in args if isinstance(obj, torch.nn.Module)]
        for module in modules:
            _attach_context_parallel_hooks(module)

        return args

    def _prepare_fsdp2(self, *args):
        # First pass: prepare everything except schedulers (and model, which is prepared separately below)
        result = [
            self._prepare_one(obj, first_pass=True) if not isinstance(obj, torch.nn.Module) else obj for obj in args
        ]

        # Second pass: prepare schedulers
        result = [self._prepare_one(obj) if not isinstance(obj, torch.nn.Module) else obj for obj in result]

        # Prepare the model
        model_index, model = None, None
        for i, obj in enumerate(result):
            if isinstance(obj, torch.nn.Module):
                model_index, model = i, obj

        # Invariant: if we have a model, we also have an optimizer (checked in `prepare`)
        if model_index is None:
            return tuple(result)

        # Needs to be done first, to make sure AC + fully_shard will work as expected
        self.state.fsdp_plugin.set_auto_wrap_policy(model)

        # Apply AC if needed
        if self.state.fsdp_plugin.activation_checkpointing:
            model = fsdp2_apply_ac(self, model)

        # Apply compile if needed, has to be *after* applying AC
        # Copied from: `accelerator.prepare_model` ~ L1804
        if self.state.dynamo_plugin.backend != DynamoBackend.NO and not is_compiled_module(model):
            if self.state.dynamo_plugin.use_regional_compilation:
                model = compile_regions(model, **self.state.dynamo_plugin.to_kwargs())
            else:
                model = torch.compile(model, **self.state.dynamo_plugin.to_kwargs())

        # Get old params and canonicalize - we canonicalize to have the mapping easy
        old_named_params = fsdp2_canonicalize_names(self._get_named_parameters(*tuple(result), drop_refs=True))

        # Swap the optimizer parameters with empty, so `fully_shard` after will not allocate too much memory
        from torch.distributed.tensor import DTensor

        for obj in result:
            if isinstance(obj, torch.optim.Optimizer):
                for param_group in obj.param_groups:
                    for i, p in enumerate(param_group["params"]):
                        # We drop a reference to the original param here, so that _move_states_to_device triggers a reallocation
                        # We reassign the data_ptr to the original param, so that we preserve the mapping to the new ones
                        param_group["params"][i] = torch.empty(1, dtype=p.dtype, device=p.device)
                        param_group["params"][i].data_ptr = (
                            p._local_tensor.data_ptr() if isinstance(p, DTensor) else p.data_ptr()
                        )

        self._models.append(model)

        # Prepare everything FSDP2 related for the model (except AC)
        model = fsdp2_prepare_model(self, model)

        # Remove the old model from the list
        if len(self._models) > 1 and (self._models[-2] is self._models[-1]):
            del self._models[-2]

        # Replace the old model with the new one (shouldn't be needed as everything should be in place)
        result[model_index] = model

        # Get new params and canonicalize
        new_named_params = fsdp2_canonicalize_names(self._get_named_parameters(*result))
        # Build a map from old to new params and handle missings gracefully
        mapping = {}
        missing_params = []
        for n, p in old_named_params.items():
            if n in new_named_params:
                mapping[p] = new_named_params[n]
            else:
                missing_params.append(n)

        if missing_params:
            # Common tied embedding parameter names
            tied_weight_names = ["lm_head.weight", "model.embed_tokens.weight", "transformer.wte.weight"]
            if any(name in missing_params for name in tied_weight_names):
                raise ValueError(
                    f"FSDP2 mapping failed (missing: {missing_params}). This is likely due to tied embeddings "
                    f"(config has tie_word_embeddings=True but checkpoint has separate weights).\n"
                    f"To fix, try: Set `model.config.tie_word_embeddings = False` after loading the model.\n"
                )
            raise KeyError(f"Parameters missing after FSDP2 wrapping: {missing_params}")

        # Update the optimizer parameters
        for obj in result:
            if isinstance(obj, torch.optim.Optimizer):
                fsdp2_switch_optimizer_parameters(obj, mapping)

        return result

    def prepare_model(
        self, model: torch.nn.Module, device_placement: bool | None = None, evaluation_mode: bool = False
    ):
        """
        Prepares a PyTorch model for training in any distributed setup. It is recommended to use
        [`Accelerator.prepare`] instead.

        Args:
            model (`torch.nn.Module`):
                A PyTorch model to prepare. You don't need to prepare a model if it is used only for inference without
                any kind of mixed precision
            device_placement (`bool`, *optional*):
                Whether or not to place the model on the proper device. Will default to `self.device_placement`.
            evaluation_mode (`bool`, *optional*, defaults to `False`):
                Whether or not to set the model for evaluation only, by just applying mixed precision and
                `torch.compile` (if configured in the `Accelerator` object).

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> # Assume a model is defined
        >>> model = accelerator.prepare_model(model)
        ```
        """
        if device_placement is None:
            device_placement = self.device_placement and self.distributed_type != DistributedType.FSDP

        self._models.append(model)

        # TODO: Look at enabling native TP training directly with a proper config
        if (
            self.verify_device_map(model)
            and self.distributed_type != DistributedType.NO
            and os.environ.get("ACCELERATE_BYPASS_DEVICE_MAP", "false") != "true"
        ):
            raise ValueError(
                "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
                " Please rerun your script specifying `--num_processes=1` or by launching with `python {{myscript.py}}`."
            )

        if self.native_amp:
            model._original_forward = model.forward
            autocast_context = get_mixed_precision_context_manager(self.native_amp, self.autocast_handler)
            # NOTE: MS-AMP adds `__func__` already to `model.forward`, so we should always use `model.forward`
            if self.fp8_backend == FP8BackendType.MSAMP or not hasattr(model.forward, "__func__"):
                model_forward_func = model.forward
                model.forward = convert_outputs_to_fp32(autocast_context(model_forward_func))
            else:
                model_forward_func = model.forward.__func__
                new_forward = autocast_context(model_forward_func)
                model.forward = MethodType(new_forward, model)
                model.forward = MethodType(convert_outputs_to_fp32(model.forward.__func__), model)

        # We prepare TE after, allowing for bf16 autocast to happen first
        if self.fp8_backend == FP8BackendType.TE and not self.delayed_fp8_autocast:
            model = apply_fp8_autowrap(model, self.te_recipe_handler or self.fp8_recipe_handler)

        if (getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)) and getattr(
            model, "hf_device_map", False
        ):
            model_devices = set(model.hf_device_map.values())
            if len(model_devices) > 1 and self.distributed_type != DistributedType.NO:
                raise ValueError(
                    "You can't train a model that has been loaded in 8-bit or 4-bit precision on multiple devices in any distributed mode."
                    " In order to use 8-bit or 4-bit models that have been loaded across multiple GPUs the solution is to use Naive Pipeline Parallelism."
                    " Therefore you should not specify that you are under any distributed regime in your accelerate config."
                )
            elif len(model_devices) == 1:
                current_device = list(model_devices)[0]
                if isinstance(current_device, torch.device):
                    current_device_index = current_device.index
                elif isinstance(current_device, str):
                    current_device_index = torch.device(current_device).index
                else:
                    current_device_index = current_device

                current_device_index = int(current_device_index) if current_device_index is not None else None
                if self.device.type == "cpu" and is_bitsandbytes_multi_backend_available():
                    # bnb with multi-backend supports CPU which don't need to check index.
                    pass
                elif torch.device(self.device.type, current_device_index) != self.device:
                    # if on the first device (GPU 0) we don't care
                    if (self.device.index is not None) or (current_device_index != 0):
                        raise ValueError(
                            "You can't train a model that has been loaded in 8-bit or 4-bit precision on a different device than the one "
                            "you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device()}` or `device_map={'':torch.xpu.current_device()}`"
                        )
            if (
                ("cpu" in model_devices and not is_bitsandbytes_multi_backend_available())
                or ("cpu" in model_devices and is_xpu_available())
                or "disk" in model_devices
            ):
                raise ValueError(
                    "You can't train a model that has been loaded in 8-bit or 4-bit precision with CPU or disk offload. "
                    "If you want train the 8-bit or 4-bit model in CPU, please install bitsandbytes with multi-backend, see https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend"
                )
        elif device_placement and not self.verify_device_map(model):
            model = model.to(self.device)
        if not evaluation_mode:
            if self.multi_device and not (self.parallelism_config and self.parallelism_config.tp_enabled):
                if model_has_dtensor(model):
                    raise ValueError(
                        "Your model contains `DTensor` parameters, which is incompatible with DDP. Maybe you loaded your model with `device_map='auto'`? Specify `device_map='cuda'` or 'xpu' or 'cpu' instead."
                    )
                if any(p.requires_grad for p in model.parameters()):
                    kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
                    # TODO: Look at enabling native TP training directly with a proper config
                    if os.environ.get("ACCELERATE_BYPASS_DEVICE_MAP", "false") != "true":
                        if self.device.type == "hpu":
                            device_ids, output_device = [self.device.index], self.device.index
                        else:
                            device_ids, output_device = [self.local_process_index], self.local_process_index
                    else:
                        device_ids, output_device = None, None
                    model = torch.nn.parallel.DistributedDataParallel(
                        model, device_ids=device_ids, output_device=output_device, **kwargs
                    )
                    if self.ddp_handler is not None:
                        self.ddp_handler.register_comm_hook(model)
            elif self.parallelism_config and self.parallelism_config.tp_enabled:
                if not hasattr(model, "tp_size"):
                    raise NotImplementedError(
                        "Model should undergo tensor parallel before passing it to accelerate."
                        "You can use .from_pretrained(..., tp_plan='auto') if the model supports"
                    )
                if model.tp_size != self.parallelism_config.tp_size:
                    raise ValueError(
                        f"tp_size in the plugin {self.parallelism_config.tp_size} should be same as model's tp size {model.tp_size}"
                    )
            elif self.is_fsdp2:
                raise ValueError(
                    "FSDP2 preparation should be done via `accelerate.prepare()`, as it requires a model and an optimizer."
                )

            elif self.distributed_type == DistributedType.FSDP:
                # We need to fix the optimizer *before* sharding the model
                from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP

                # Check if the model is already a FSDP model due to `Manual Wrapping` and if so,
                # don't wrap it again
                # In case the model is already compiled using PyTorch 2.0 and the wrapped model in it
                # is a FSDP model, don't wrap it again
                is_type_fsdp = isinstance(model, FSDP) or (
                    is_compiled_module(model) and isinstance(model._orig_mod, FSDP)
                )

                if not is_type_fsdp:
                    self.state.fsdp_plugin.set_auto_wrap_policy(model)
                    fsdp_plugin = self.state.fsdp_plugin

                    # need to ensure that params are re-tied after running
                    # param_init_fn
                    fsdp_plugin.param_init_fn = ensure_weights_retied(
                        fsdp_plugin.param_init_fn,
                        model,
                        self.device,
                    )

                    kwargs = {
                        # We fallback to reshard_after_forward if sharding_strategy is not set.
                        # We prefer sharding_strategy to not break the behavior of the existing code.
                        # Deprecation warning has already been issued in `utils.dataclasses.py`
                        "sharding_strategy": fsdp_plugin.sharding_strategy or fsdp_plugin.reshard_after_forward,
                        "cpu_offload": fsdp_plugin.cpu_offload,
                        "auto_wrap_policy": fsdp_plugin.auto_wrap_policy,
                        "mixed_precision": fsdp_plugin.mixed_precision_policy,
                        "sync_module_states": fsdp_plugin.sync_module_states,
                        "backward_prefetch": fsdp_plugin.backward_prefetch,
                        "forward_prefetch": fsdp_plugin.forward_prefetch,
                        "use_orig_params": fsdp_plugin.use_orig_params,
                        "param_init_fn": fsdp_plugin.param_init_fn,
                        "ignored_modules": fsdp_plugin.ignored_modules,
                        "limit_all_gathers": fsdp_plugin.limit_all_gathers,
                        "device_id": self.device,
                    }

                    if isinstance(kwargs["ignored_modules"], str):
                        reg = re.compile(kwargs["ignored_modules"])
                        ignored = []
                        for name, module in model.named_modules():
                            if reg.fullmatch(name):
                                # ensure that the device for these modules is still set correctly
                                module.to(self.device)
                                ignored.append(module)
                        kwargs["ignored_modules"] = ignored

                    model = FSDP(model, **kwargs)
                    if fsdp_plugin.activation_checkpointing:
                        from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
                            CheckpointImpl,
                            apply_activation_checkpointing,
                            checkpoint_wrapper,
                        )

                        apply_activation_checkpointing(
                            model,
                            checkpoint_wrapper_fn=functools.partial(
                                checkpoint_wrapper,
                                checkpoint_impl=CheckpointImpl.NO_REENTRANT,
                            ),
                            auto_wrap_policy=fsdp_plugin.auto_wrap_policy,
                        )

                # In the event the model had been loaded in low precision, but
                # mixed precision had also been activated, then we follow DeepSpeed's
                # strategy to hold the parameters in full precision.
                # - assume that trainer.args.bf16 and trainer.args.fp16 are already checked against
                #   fsdp_plugin.mixed_precision_policy.
                # - NOTE: we do not check the mixed_precision attribute on the FSDP root wrapper.
                #   * this attribute will always set by init_utils.init_core_state so its always not None.
                #   * mixed_precision.param_dtype only regards _fwd_bwd_param_dtype
                #   * if model is loaded in 16bit, and even if mixed_precision.param_dtype is None,
                #     we still want to upcast the flat_param.
                if self.mixed_precision != "no":  # if mixed precision is set
                    upcasted_log = []
                    for module in FSDP.fsdp_modules(model):
                        # Referencing DeepSpeed Zero3
                        # - in Init, params are converted to 16bit while partitioning.
                        # - in accelerator.prepare, deepspeed.initialize is called to:
                        #   * creates the DeepSpeedEngine.
                        #   * since zero_optimization() is True , calls engine._configure_zero_optimizer.
                        #
                        # Inside the DeepSpeed Zero3 optimizer configuration, which initializes
                        # DeepSpeedZeroOptimizer_Stage3, during which:
                        #   * trainable_param_groups are obtained from the attached optimizer
                        #     (already partitioned in 16bit).
                        #   * then _setup_for_real_optimizer -> _create_fp32_partitions
                        #     which performs the fp32 upcasting.

                        # To mimic DeepSpeed's casting in FSDP, we look at the (single) FlatParameter held
                        # within an FSDP wrapper. This FlatParameter will be seen by the optimizer.
                        #  - even though there is a torch.device('meta') guard below, we
                        #    expect _init_utils._init_param_handle_from_module to already
                        #    sync the parameter.

                        if not module._has_params:
                            continue  # skip if FSDP module not managing parameters
                        param = module._flat_param
                        if (
                            param.dtype != torch.float32
                            and param.device != torch.device("meta")
                            and param.requires_grad
                        ):
                            # keep log of names_params that was upcasted
                            # NOTE: resorted to this because warnings.simplefilter("once") is somehow not working
                            name_param_log = (module.module.__class__.__name__, ", ".join(module._flat_param._fqns))
                            if name_param_log not in upcasted_log:
                                upcasted_log.append(name_param_log)

                            # this works because of FSDP's _runtime_utils.lazy_init.
                            # Have to be careful not to call anything before this that
                            # triggers lazy_init (e.g., _is_fsdp_root).
                            param.data = param.data.to(torch.float32)  # upcasting
                            module._handle._orig_param_dtype = torch.float32  # update

                    # report the warnings
                    # some messages can be quite repetitive, especially when reporting about layers that have identical architecture.
                    if self.is_main_process:
                        for name_log, param_log in upcasted_log:
                            warnings.warn(
                                f"Upcasted low precision parameters in {name_log} because mixed precision turned on in FSDP. "
                                f"Affects: {param_log}."
                            )

                        if len(upcasted_log) > 0:
                            warnings.warn(
                                "FSDP upcast of low precision parameters may affect the precision of model checkpoints."
                            )

                # if the previous and current models are same, delete the previous one
                if len(self._models) > 1 and (self._models[-2] is self._models[-1]):
                    del self._models[-2]
                self._models[-1] = model
            elif self.distributed_type == DistributedType.MULTI_CPU:
                kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler else {}
                model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
                if self.ddp_handler is not None:
                    self.ddp_handler.register_comm_hook(model)
            elif self.distributed_type == DistributedType.XLA and self.state.fork_launched:
                model = xmp.MpModelWrapper(model).to(self.device)
        # Now we can apply the FP8 autocast
        if self.fp8_backend == FP8BackendType.TE and self.delayed_fp8_autocast:
            model = apply_fp8_autowrap(model, self.te_recipe_handler or self.fp8_recipe_handler)
        # torch.compile should be called last and only if the model isn't already compiled
        if self.state.dynamo_plugin.backend != DynamoBackend.NO and not is_compiled_module(model):
            if self.state.dynamo_plugin.use_regional_compilation:
                model = compile_regions(model, **self.state.dynamo_plugin.to_kwargs())
            else:
                model = torch.compile(model, **self.state.dynamo_plugin.to_kwargs())
        return model

    def _prepare_ao(self, *args):
        """
        Converts every `torch.nn.Module` in `args` to FP8 with `torchao`, using the config and module filter held by
        `self.ao_recipe_handler`.

        Under FSDP2, when at least one `torch.optim.Optimizer` is in `args` and the recipe config enables
        `enable_fsdp_float8_all_gather`, a step post-hook is registered on the first optimizer. The hook calls
        `precompute_float8_dynamic_scale_for_fsdp` on the first module found in `args`.

        Args:
            *args:
                The objects being prepared. Only `torch.nn.Module` and `torch.optim.Optimizer` instances are looked
                at; everything else is ignored.

        Returns:
            `tuple`: `args` itself. No element is replaced by this method.

        Raises:
            ImportError: If `is_torchao_available()` reports that `torchao` cannot be used.
        """
        if not is_torchao_available():
            raise ImportError(
                "`torchao` was not found on your system or is too old of a version. Please ensure that `torchao >= 0.6.1` is installed"
            )

        recipe = self.ao_recipe_handler
        modules = [obj for obj in args if isinstance(obj, torch.nn.Module)]
        for module in modules:
            convert_model_to_fp8_ao(
                module,
                config=recipe.config,
                module_filter_func=recipe.module_filter_func,
            )

        # The scale precomputation below is only relevant to FSDP2
        if not self.is_fsdp2:
            return args

        # Invariant: with FSDP2, optimizer is always passed to `prepare()` together with model
        # We only precompute scales if float8 all gather is enabled, possibly can add a flag for this later
        optimizers = [obj for obj in args if isinstance(obj, torch.optim.Optimizer)]
        if optimizers and recipe.config.enable_fsdp_float8_all_gather:
            from torchao.float8 import precompute_float8_dynamic_scale_for_fsdp

            def _precompute_scales(*_hook_args, **_hook_kwargs):
                # The first module is looked up when the hook fires, not when it is registered
                return precompute_float8_dynamic_scale_for_fsdp(modules[0])

            optimizers[0].register_step_post_hook(_precompute_scales)

        return args

    def _prepare_te(self, *args):
        """
        Runs `convert_model` on the single model in `args` and re-points the single optimizer in `args` at the
        model's parameters as they exist after that conversion.

        Args:
            *args:
                The objects being prepared. Must contain either no model and no optimizer, or exactly one
                `torch.nn.Module` together with exactly one `torch.optim.Optimizer`.

        Returns:
            `list`: The objects of `args`, in the same order. The model and optimizer are modified in place.

        Raises:
            ImportError: If `is_transformer_engine_available()` reports that `transformer_engine` is missing.
            ValueError: If only one of model / optimizer is passed, or more than one of either is passed.
        """
        if not is_transformer_engine_available():
            raise ImportError(
                "`transformer_engine` was not found on your system. Please ensure that `transformer_engine` is installed"
            )

        result = list(args)
        models = [obj for obj in result if isinstance(obj, torch.nn.Module)]
        # An object counted as a model is never counted as an optimizer as well
        optimizers = [
            obj
            for obj in result
            if not isinstance(obj, torch.nn.Module) and isinstance(obj, torch.optim.Optimizer)
        ]
        num_models, num_optimizers = len(models), len(optimizers)

        if num_models == 0 and num_optimizers == 0:
            # Nothing to convert
            return result
        if num_models == 0 or num_optimizers == 0:
            raise ValueError(
                "You must pass a model and an optimizer together to `accelerate.prepare()` when using TransformerEngine."
            )
        if num_models > 1 or num_optimizers > 1:
            raise ValueError(
                f"You can't use multiple models ({num_models}) or optimizers {num_optimizers} with TransformerEngine."
            )

        model = models[0]
        optimizer = optimizers[0]

        # Snapshot the named parameters on both sides of the conversion so they can be matched up by name
        params_before = self._get_named_parameters(model)
        with torch.no_grad():
            convert_model(model)
        params_after = self._get_named_parameters(model)

        # Swap every parameter referenced by the optimizer for the same-named parameter of the converted model
        old_to_new = {old_param: params_after[name] for name, old_param in params_before.items()}
        for group in optimizer.param_groups:
            group["params"] = [old_to_new[param] for param in group["params"]]

        return result

    def _prepare_deepspeed(self, *args):
        """
        Prepares the objects passed to `prepare()` for training with DeepSpeed.

        The dataloaders in `args` are prepared first. The model, optimizer and scheduler found in `args` (the last
        one of each kind wins) are validated against the DeepSpeed config. When a model is present, the config
        entries handled here are filled in and `deepspeed.initialize` (or the MS-AMP variant when the FP8 backend is
        MS-AMP) is called to build the engine. Without a model, no engine is built and only the dataloaders are
        replaced.

        Args:
            *args:
                The objects being prepared: any mix of `torch.nn.Module`, optimizers (`torch.optim.Optimizer` or
                `DummyOptim`), schedulers (`LRScheduler`, `DummyScheduler` or an object whose class name is one of
                DeepSpeed's `VALID_LR_SCHEDULES`) and `torch.utils.data.DataLoader`. Other objects are passed through.

        Returns:
            `tuple`: The objects in the same order as `args`, with dataloaders replaced by their prepared version
            and, when a model was passed, the model replaced by the DeepSpeed engine and the optimizer / scheduler
            replaced by their wrappers.

        Raises:
            ImportError: If DeepSpeed tensor parallelism or ALST/Ulysses sequence parallelism is requested with a
                DeepSpeed / PyTorch version that is too old.
            ValueError: If the batch size cannot be determined, if the optimizer / scheduler passed in code conflict
                with the DeepSpeed config, if `auto` bucket sizes cannot be derived from `model.config`, if an
                offloaded optimizer is used with HPU in lazy mode, or if sequence parallelism is misconfigured.
        """
        import deepspeed

        ds_initialize = deepspeed.initialize
        if self.fp8_backend == FP8BackendType.MSAMP:
            # MS-AMP requires DeepSpeed patches
            from msamp import deepspeed as msamp_deepspeed

            ds_initialize = msamp_deepspeed.initialize

        deepspeed_plugin = self.deepspeed_plugin

        is_dataloader_present = any(isinstance(obj, torch.utils.data.DataLoader) for obj in args)
        tp_size = deepspeed_plugin.deepspeed_config.get("tensor_parallel", {}).get("autotp_size", 0)

        # Sequence parallelism settings; without a parallelism config, sequence parallelism is off (size 1)
        sp_backend = self.parallelism_config.sp_backend if self.parallelism_config else None
        sp_size = self.parallelism_config.sp_size if self.parallelism_config else 1
        sp_handler = self.parallelism_config.sp_handler if self.parallelism_config else None

        if tp_size > 1:
            if not compare_versions("deepspeed", ">=", "0.16.4"):
                raise ImportError(
                    "Deepspeed TP requires deepspeed >= 0.16.4, Please update DeepSpeed via `pip install deepspeed -U`."
                )
            if not is_torch_version(">=", "2.2.0"):
                raise ImportError(
                    "Tried to use TP, but `torch.distributed.device_mesh` requires PyTorch >= 2.2.0. Please upgrade your PyTorch version"
                )
            from torch.distributed.device_mesh import init_device_mesh

            mesh_dim_name = "tp"
            self.state.ds_device_mesh = init_device_mesh(self.device.type, (tp_size,), mesh_dim_names=(mesh_dim_name,))

        # Only the dataloaders are prepared at this point; every other object is handled further down
        result = [
            self._prepare_one(obj, first_pass=True) if isinstance(obj, torch.utils.data.DataLoader) else obj
            for obj in args
        ]

        if deepspeed_plugin.is_auto("train_micro_batch_size_per_gpu"):
            if is_dataloader_present:
                # Batch sizes are read from the original (unprepared) objects
                batch_sizes = [obj.batch_size for obj in args if hasattr(obj, "batch_size")]
                if any(bs is None for bs in batch_sizes):
                    raise ValueError(
                        "At least one of the dataloaders passed to `accelerate.prepare()` has `None` as batch size. "
                        "Please set an integer value in `train_micro_batch_size_per_gpu` in the deepspeed config file "
                        "or assign integer value to `AcceleratorState().deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu']`."
                    )
                if self.split_batches:
                    # With `split_batches`, each batch size is divided across the processes
                    batch_sizes = [batch_size // self.num_processes for batch_size in batch_sizes]

                batch_size_per_device = min(batch_sizes) if deepspeed_plugin.is_train_batch_min else max(batch_sizes)
                if len(batch_sizes) > 1:
                    logger.info(
                        "Since you passed both train and evaluation dataloader, `is_train_batch_min` (here "
                        f"{deepspeed_plugin.is_train_batch_min} will decide the `train_batch_size` ({batch_size_per_device})."
                    )
            else:
                raise ValueError(
                    "When using DeepSpeed, `accelerate.prepare()` requires you to pass at least one of training or evaluation dataloaders "
                    "with `batch_size` attribute returning an integer value "
                    "or alternatively set an integer value in `train_micro_batch_size_per_gpu` in the deepspeed config file "
                    "or assign integer value to `AcceleratorState().deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu']`."
                )
        else:
            batch_size_per_device = deepspeed_plugin.get_value("train_micro_batch_size_per_gpu")

        # handle `gradient_accumulation_steps` when the value is `auto`
        deepspeed_plugin.fill_match(
            "gradient_accumulation_steps",
            must_match=False,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
        )

        deepspeed_gradient_accumulation_steps = deepspeed_plugin.get_value("gradient_accumulation_steps")
        # update gradient_accumulation_steps if there is a mismatch
        if deepspeed_gradient_accumulation_steps != self.gradient_accumulation_steps:
            logger.warning(
                f"Gradient accumulation steps mismatch: GradientAccumulationPlugin has {self.gradient_accumulation_steps}, "
                f"DeepSpeed config has {deepspeed_gradient_accumulation_steps}. Using DeepSpeed's value."
            )
            self.gradient_accumulation_steps = deepspeed_gradient_accumulation_steps

        # Values handed to `deepspeed_config_process` below (only reached when a model is passed)
        config_kwargs = {
            "gradient_clipping": 1.0,
            "zero_optimization.stage3_gather_16bit_weights_on_model_save": False,
        }
        # This block is skipped when preparing just a model and DL is absent from current call's args
        if batch_size_per_device is not None:
            config_kwargs["train_micro_batch_size_per_gpu"] = batch_size_per_device
            config_kwargs["train_batch_size"] = (
                batch_size_per_device
                * deepspeed_plugin.get_value("gradient_accumulation_steps")
                * self.num_processes
                // sp_size
            )

        # Pick out the model / optimizer / scheduler; if several of a kind are passed, the last one is kept
        model = None
        optimizer = None
        scheduler = None
        for obj in result:
            if isinstance(obj, torch.nn.Module):
                model = obj
            elif isinstance(obj, (torch.optim.Optimizer, DummyOptim)):
                optimizer = obj
            elif (isinstance(obj, (LRScheduler, DummyScheduler))) or (
                type(obj).__name__ in deepspeed.runtime.lr_schedules.VALID_LR_SCHEDULES
            ):
                scheduler = obj

        if optimizer is not None:
            if "optimizer" in deepspeed_plugin.deepspeed_config and not isinstance(optimizer, (DummyOptim)):
                raise ValueError(
                    "You cannot specify an optimizer in the config file and in the code at the same time. "
                    "Please remove the optimizer from the config file or "
                    "create `accelerate.utils.DummyOptim` in the code."
                )
            elif "optimizer" not in deepspeed_plugin.deepspeed_config and isinstance(optimizer, (DummyOptim)):
                raise ValueError(
                    "You cannot create a `DummyOptim` without specifying an optimizer in the config file."
                )

            if isinstance(optimizer, (torch.optim.Optimizer)):
                deepspeed_plugin.deepspeed_config["zero_allow_untested_optimizer"] = True

        if scheduler is not None:
            if "scheduler" in deepspeed_plugin.deepspeed_config and not isinstance(scheduler, (DummyScheduler)):
                raise ValueError(
                    "You cannot specify a scheduler in the config file and in the code at the same time. "
                    "Please remove the scheduler from the config file or "
                    "create `accelerate.utils.DummyScheduler` in the code."
                )
            elif (
                "scheduler" not in deepspeed_plugin.deepspeed_config
                and isinstance(scheduler, (DummyScheduler))
                and scheduler.lr_scheduler_callable is None
            ):
                raise ValueError(
                    "Either specify a scheduler in the config file or "
                    "pass in the `lr_scheduler_callable` parameter when using `accelerate.utils.DummyScheduler`."
                )

        if optimizer is not None and scheduler is not None:
            if isinstance(optimizer, (DummyOptim)) and not isinstance(scheduler, (DummyScheduler)):
                raise ValueError(
                    "You can only specify `accelerate.utils.DummyScheduler` in the code when using "
                    "`accelerate.utils.DummyOptim`."
                )

        if model is not None:
            # If we are using FP8, we need to apply the autowrap now.
            # Same handler selection as in `prepare_model`: prefer the TE recipe handler and fall back
            # to `fp8_recipe_handler` when no TE handler is set.
            if self.fp8_backend == FP8BackendType.TE:
                model = apply_fp8_autowrap(model, self.te_recipe_handler or self.fp8_recipe_handler)
            # if the model is an MOE, set the appropriate MOE layers as leaf Z3 modules
            deepspeed_plugin.set_moe_leaf_modules(model)
            # deal with config keys that use `auto` value and rely on model's hidden_size
            hidden_size_based_keys = [
                "zero_optimization.reduce_bucket_size",
                "zero_optimization.stage3_prefetch_bucket_size",
                "zero_optimization.stage3_param_persistence_threshold",
            ]
            hidden_size_auto_keys = [x for x in hidden_size_based_keys if deepspeed_plugin.is_auto(x)]
            if len(hidden_size_auto_keys) > 0:
                reasoning = (
                    "therefore it's not possible to automatically fill out the following `auto` entries "
                    + f"in the DeepSpeed config file: {hidden_size_auto_keys}. You can fix that by replacing "
                    + "`auto` values for these keys with an integer value of your choice."
                )
                if not hasattr(model, "config"):
                    raise ValueError("Can't find `model.config` entry, " + reasoning)

                if hasattr(model.config, "hidden_size"):
                    hidden_size = model.config.hidden_size
                elif hasattr(model.config, "hidden_sizes"):
                    # if there are many hidden sizes pick the largest one
                    hidden_size = max(model.config.hidden_sizes)
                else:
                    raise ValueError(
                        "Can find neither `model.config.hidden_size` nor `model.config.hidden_sizes`, " + reasoning
                    )

                config_kwargs.update(
                    {
                        "zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
                        "zero_optimization.stage3_prefetch_bucket_size": int(0.9 * hidden_size * hidden_size),
                        "zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
                    }
                )

            if isinstance(optimizer, (DummyOptim)):
                config_kwargs.update(
                    {"optimizer.params.lr": optimizer.lr, "optimizer.params.weight_decay": optimizer.weight_decay}
                )
            if isinstance(scheduler, (DummyScheduler)) and scheduler.lr_scheduler_callable is None:
                # Use the optimizer's `defaults["lr"]` when it has `defaults`, otherwise its `lr` attribute (if any)
                max_lr = (
                    getattr(scheduler.optimizer, "lr", None)
                    if getattr(scheduler.optimizer, "defaults", None) is None
                    else scheduler.optimizer.defaults["lr"]
                )
                config_kwargs.update(
                    {
                        "scheduler.params.warmup_min_lr": 0,
                        "scheduler.params.warmup_max_lr": max_lr,
                        "scheduler.params.warmup_num_steps": scheduler.warmup_num_steps,
                    }
                )
                if scheduler.total_num_steps is not None:
                    # The step count is divided by the number of processes unless `split_batches` is set
                    config_kwargs["scheduler.params.total_num_steps"] = (
                        math.ceil(scheduler.total_num_steps / self.num_processes)
                        if not self.split_batches
                        else scheduler.total_num_steps
                    )

            deepspeed_plugin.deepspeed_config_process(must_match=False, **config_kwargs)
            self.deepspeed_config = deepspeed_plugin.deepspeed_config

            # note: batch_size derivation is all over the map, especially in HF Trainer, so try to fix it at the last moment if needed
            pc = self.parallelism_config
            if pc is not None and pc.sp_backend == "deepspeed" and pc.sp_size > 1:
                self.deepspeed_config["train_batch_size"] = (
                    self.deepspeed_config["train_micro_batch_size_per_gpu"]
                    * self.deepspeed_config["gradient_accumulation_steps"]
                    * pc.data_parallel_size
                )

            # Keyword arguments for `deepspeed.initialize`
            kwargs = dict(model=model, config_params=self.deepspeed_config)
            if optimizer is not None:
                if isinstance(optimizer, (DummyOptim)):
                    kwargs["model_parameters"] = optimizer.params
                    if isinstance(scheduler, (DummyScheduler)) and scheduler.lr_scheduler_callable is not None:
                        kwargs["lr_scheduler"] = scheduler.lr_scheduler_callable
                else:
                    if self.deepspeed_config["zero_optimization"].get("offload_optimizer", {}).get(
                        "device", "none"
                    ) != "none" and self.deepspeed_config.get("zero_force_ds_cpu_optimizer", True):
                        if self.device.type == "hpu" and os.environ.get("PT_HPU_LAZY_MODE", "1") == "1":
                            raise ValueError(
                                "You can't use an Offload Optimizer with HPU in Lazy Mode. "
                                "Please set the environment variable `PT_HPU_LAZY_MODE` to `0`."
                            )

                        optimizer = map_pytorch_optim_to_deepspeed(optimizer)
                    kwargs["optimizer"] = optimizer
                    if scheduler is not None:
                        if type(scheduler).__name__ in deepspeed.runtime.lr_schedules.VALID_LR_SCHEDULES:
                            kwargs["lr_scheduler"] = scheduler

            if self.device.type == "hpu":
                # This env variable is initialized here to make sure it is set to "true"
                # It should be done by the launcher but it does not work for multi-node runs
                os.environ["DEEPSPEED_USE_HPU"] = "true"

            mpu = None
            if sp_size > 1:
                if sp_backend != "deepspeed":
                    raise ValueError(
                        f"In order to use the configured {sp_size=} with DeepSpeed, you need to configure sp_backend='deepspeed', yet you configured it to be {sp_backend=}."
                    )

                ver_min_required = "0.18.2"
                if not compare_versions("deepspeed", ">=", ver_min_required):
                    raise ImportError(
                        f"Deepspeed ALST/Ulysses requires deepspeed>={ver_min_required}. Please update DeepSpeed via `pip install deepspeed -U`."
                    )

                from deepspeed.runtime.sequence_parallel.ulysses_sp import (
                    UlyssesSPAttentionHF,
                    UlyssesSPDataLoaderAdapter,
                )

                if not hasattr(model, "config"):
                    raise ValueError(
                        "UlyssesSPAttentionHF currently works with HF Transformers and expects the model object to have a config attribute but this model doesn't have one."
                    )

                # `disable_in_eval` is only passed when the installed `register_with_transformers` accepts it
                sp_register_kwargs = {}
                signature = inspect.signature(UlyssesSPAttentionHF.register_with_transformers)
                if "disable_in_eval" in signature.parameters:
                    sp_register_kwargs["disable_in_eval"] = True

                mpu = UlyssesSPAttentionHF.register_with_transformers(
                    model_name_or_path=model,
                    sequence_parallel_size=sp_size,
                    seq_length=sp_handler.sp_seq_length,
                    seq_length_is_variable=sp_handler.sp_seq_length_is_variable,
                    core_attn_implementation=sp_handler.sp_attn_implementation,
                    micro_batch_size=batch_size_per_device,
                    **sp_register_kwargs,
                )
                kwargs["mpu"] = mpu

                for i, obj in enumerate(result):
                    if isinstance(obj, torch.utils.data.DataLoader):
                        # note that in case dataloader was prepared apart from model (for the external accelerator.prepare call) you'd need to call deepspeed_ulysses_dl_adapter after prepare(model) (see HF Trainer as the use-case)
                        sp_group = mpu.get_sequence_parallel_group()
                        sp_world_size = mpu.get_sequence_parallel_world_size()
                        sp_rank = mpu.get_sequence_parallel_rank()
                        result[i] = UlyssesSPDataLoaderAdapter(
                            obj,
                            sp_rank=sp_rank,
                            sp_group=sp_group,
                            sp_world_size=sp_world_size,
                            device=self.device,  # model.device,
                        )

            engine, optimizer, _, lr_scheduler = ds_initialize(**kwargs)

            if compare_versions("deepspeed", ">=", "0.14.4") and self.state.dynamo_plugin.backend != DynamoBackend.NO:
                compile_kwargs = self.state.dynamo_plugin.to_kwargs()
                if self.state.dynamo_plugin.use_regional_compilation:
                    compile_regions_deepspeed(engine.module, **compile_kwargs)
                else:
                    engine.compile(backend=compile_kwargs.pop("backend"), compile_kwargs=compile_kwargs)
            if optimizer is not None:
                optimizer = DeepSpeedOptimizerWrapper(optimizer)
            if scheduler is not None:
                if lr_scheduler is None:
                    # `deepspeed.initialize` returned no scheduler, so the one passed in is wrapped instead
                    scheduler = AcceleratedScheduler(
                        scheduler,
                        optimizer,
                        step_with_optimizer=self.step_scheduler_with_optimizer,
                        split_batches=self.split_batches,
                    )
                else:
                    scheduler = DeepSpeedSchedulerWrapper(lr_scheduler, optimizer)

            # Substitute the engine and the wrapped optimizer / scheduler at their original positions
            for i, obj in enumerate(result):
                if isinstance(obj, torch.nn.Module):
                    result[i] = engine
                elif isinstance(obj, (torch.optim.Optimizer, DummyOptim)):
                    result[i] = optimizer
                elif (isinstance(obj, (LRScheduler, DummyScheduler))) or (
                    type(obj).__name__ in deepspeed.runtime.lr_schedules.VALID_LR_SCHEDULES
                ):
                    result[i] = scheduler

            # pointing for deepspeed_engine_wrapped.backward()
            if self.deepspeed_engine_wrapped is None:
                self.deepspeed_engine_wrapped = DeepSpeedEngineWrapper(engine)
            else:
                logger.warning(
                    "A wrapped DeepSpeed engine reference is currently tied for this `Accelerator()` instance. "
                    "If you want to call `accelerator.backward()` referencing a new model/engine, "
                    "please create a separate `Accelerator()` instance and call `accelerator.prepare()` on it."
                )
            self._models.append(engine)
            if optimizer is not None:
                self._optimizers.append(optimizer)
            if scheduler is not None:
                self._schedulers.append(scheduler)
        return tuple(result)

    def deepspeed_ulysses_dl_adapter(self, dl, model):
        """
        Wraps `dl` in DeepSpeed's `UlyssesSPDataLoaderAdapter` when sequence parallelism is enabled.

        This is normally done as part of `prepare`. When the dataloader was prepared apart from the model (for the
        external `accelerator.prepare` call), this additional call needs to be made after `prepare(model)` (see HF
        Trainer as the use-case).

        Args:
            dl:
                The dataloader to adapt.
            model:
                The prepared model. Only its `device` attribute is read.

        Returns:
            `dl` itself when the sequence parallel size is 1 (or when no parallelism config is set), otherwise the
            `UlyssesSPDataLoaderAdapter` wrapping `dl`.
        """
        parallelism_config = self.parallelism_config
        if (parallelism_config.sp_size if parallelism_config else 1) == 1:
            return dl

        # These imports sit after the early return, so DeepSpeed is only imported when it is actually needed.
        from deepspeed.runtime.sequence_parallel.ulysses_sp import UlyssesSPDataLoaderAdapter
        from deepspeed.utils import groups

        return UlyssesSPDataLoaderAdapter(
            dl,
            sp_group=groups._get_sequence_parallel_group(),
            sp_world_size=groups._get_sequence_parallel_world_size(),
            sp_rank=groups._get_sequence_parallel_rank(),
            device=model.device,
        )

    def _prepare_megatron_lm(self, *args):
        """
        Prepares the objects passed to `prepare` for training with Megatron-LM.

        The micro batch size is first derived from the passed dataloaders, then the Megatron-LM plugin is configured
        from the model, optimizer and scheduler found in `args`. Megatron-LM is initialized afterwards and every
        object in `args` is replaced by its prepared counterpart.

        Args:
            *args:
                The objects to prepare (dataloaders, model, optimizer, scheduler). Objects of any other type are
                returned unchanged.

        Returns:
            `tuple`: The prepared objects, in the same order as `args`.

        Raises:
            ValueError: If no micro batch size can be determined from `args`, or if a scheduler other than
                `MegatronLMDummyScheduler` is passed.
            AssertionError: If more than one model ends up registered on this `Accelerator` instance.
        """
        megatron_lm_plugin = self.state.megatron_lm_plugin
        micro_batch_size = None
        if not megatron_lm_plugin.megatron_dataset_flag:
            # Without Megatron datasets, the micro batch size comes from the `batch_size` attribute of the
            # objects that have one.
            batch_sizes = [obj.batch_size for obj in args if hasattr(obj, "batch_size")]
            if len(batch_sizes) == 0:
                raise ValueError(
                    "You must specify a training or evaluation dataloader in `accelerate.prepare()` when using Megatron-LM."
                )

            micro_batch_size = min(batch_sizes) if megatron_lm_plugin.is_train_batch_min else max(batch_sizes)
            if len(batch_sizes) > 1:
                logger.info(
                    "Since you passed both train and evaluation dataloader, `is_train_batch_min` (here "
                    f"{megatron_lm_plugin.is_train_batch_min} will decide the `train_batch_size` ({micro_batch_size})."
                )
        else:
            # With Megatron datasets, the micro batch size is read from the first dummy dataloader found.
            for obj in args:
                if isinstance(obj, MegatronLMDummyDataLoader):
                    micro_batch_size = obj.dataset_args["micro_batch_size"]
                    break
        if micro_batch_size is not None:
            # Number of processes divided by the product of the tensor- and pipeline-parallel degrees.
            dp_degree = self.num_processes // (megatron_lm_plugin.tp_degree * megatron_lm_plugin.pp_degree)
            megatron_lm_plugin.set_training_args(micro_batch_size, dp_degree)
        else:
            raise ValueError(
                "When you do not pass the dataloader parameter, the `data_parallel_size`, "
                "`micro_batch_size`, and `global_batch_size` megatron parameters will not be updated."
            )
        model = None
        optimizer = None
        scheduler = None
        batch_data = None
        # Pick up the objects used to configure the plugin. Only the first batch of the first `DataLoader` is
        # fetched; when several models/optimizers/schedulers are passed, the last one of each kind wins.
        for obj in args:
            if isinstance(obj, torch.utils.data.DataLoader) and batch_data is None:
                batch_data = next(iter(obj))
            elif isinstance(obj, torch.nn.Module):
                model = obj
            elif isinstance(obj, (torch.optim.Optimizer)):
                optimizer = obj
            elif isinstance(obj, (LRScheduler, MegatronLMDummyScheduler)):
                scheduler = obj

        if model is not None:
            megatron_lm_plugin.set_network_size_args(model, batch_data)
        if optimizer is not None:
            megatron_lm_plugin.set_optimizer_type(optimizer)
        if scheduler is not None:
            if not isinstance(scheduler, MegatronLMDummyScheduler):
                raise ValueError(
                    "You can't use a custom scheduler with Megatron-LM. Please use the `accelerate.utils.MegatronLMDummyScheduler` instead."
                )
            megatron_lm_plugin.set_scheduler_args(scheduler)

        # initialize megatron-lm
        megatron_lm_initialize(self, args_defaults=megatron_lm_plugin.megatron_lm_default_args)

        # From here on `model`, `optimizer` and `scheduler` refer to the objects returned by Megatron-LM, not to
        # the ones passed in `args`.
        (model, optimizer, scheduler) = megatron_lm_prepare_model_optimizer_scheduler(self)
        self.wait_for_everyone()

        # `counter` counts the dataloaders (regular and dummy) seen so far.
        counter = 0
        result = []
        for obj in args:
            if isinstance(obj, torch.utils.data.DataLoader):
                result.append(megatron_lm_prepare_data_loader(self, obj))
                counter += 1
            elif isinstance(obj, MegatronLMDummyDataLoader):
                # The dummy dataloaders are built in a single call, made when the first one is encountered; the
                # following ones index into that result.
                # NOTE(review): if a regular `DataLoader` precedes the first dummy dataloader, `counter` is not 0
                # here and `dataloaders` is never assigned -- confirm this combination cannot happen.
                if counter == 0:
                    obj.set_megatron_data_args()
                    dataloaders = megatron_lm_prepare_data_loader(self, obj)
                result.append(dataloaders[counter])
                counter += 1
            else:
                result.append(obj)

        if model is not None:
            model = MegatronEngine(self, model, optimizer, scheduler)
        if optimizer is not None:
            optimizer = MegatronLMOptimizerWrapper(optimizer)
        if scheduler is not None:
            scheduler = MegatronLMSchedulerWrapper(scheduler, optimizer)

        # Swap the user-provided model/optimizer/scheduler for their wrapped Megatron-LM counterparts.
        for i in range(len(result)):
            if isinstance(result[i], torch.nn.Module):
                result[i] = model
            elif isinstance(result[i], torch.optim.Optimizer):
                result[i] = optimizer
            elif isinstance(result[i], MegatronLMDummyScheduler):
                result[i] = scheduler

        # Keep track of the prepared objects on the `Accelerator` instance.
        if model is not None:
            self._models.append(model)
            if len(self._models) > 1:
                raise AssertionError(
                    "You can't use same `Accelerator()` instance with multiple models when using Megatron-LM"
                )
        if optimizer is not None:
            self._optimizers.append(optimizer)
        if scheduler is not None:
            self._schedulers.append(scheduler)

        return tuple(result)

    def _prepare_device_mesh(self):
        """
        Selects the device mesh used for distributed training. The dataloader will determine how to load data based
        on this device mesh.

        Returns:
            `self.state.ds_device_mesh` when running with DeepSpeed and the state has that attribute, otherwise
            `self.torch_device_mesh`.
        """
        has_deepspeed_mesh = self.distributed_type == DistributedType.DEEPSPEED and hasattr(
            self.state, "ds_device_mesh"
        )
        if not has_deepspeed_mesh:
            return self.torch_device_mesh
        return self.state.ds_device_mesh

    def _prepare_msamp(self, *args, device_placement):
        """
        Wraps the model and optimizer found in `args` with MS-AMP (deprecated FP8 backend).

        Args:
            *args:
                The objects passed to `prepare`. At most one `torch.nn.Module` and one `torch.optim.Optimizer` may be
                present, and they have to be passed together. Any other object is returned unchanged.
            device_placement (`list`):
                The device placement flags, index-aligned with `args`. The sequence passed by the caller is never
                modified in place.

        Returns:
            `tuple`: `(objects, device_placement)` where `objects` is always a tuple holding `args` with the model and
            optimizer replaced by the ones returned by `msamp.initialize`. When a model/optimizer pair was wrapped,
            `device_placement` is a new list with the optimizer's entry set to `False`; otherwise it is returned
            as received.

        Raises:
            ImportError: If MS-AMP is not available.
            ValueError: If only one of model/optimizer is passed, or if more than one model or optimizer is passed.
        """
        warnings.warn(
            "MS-AMP is deprecated and will be removed in a future version of Accelerate. "
            "Please use `'te'` (Transformer Engine) or `'torchao'` as the backend for FP8 "
            "mixed precision training instead.",
            FutureWarning,
        )
        if not is_msamp_available():
            raise ImportError(
                "MS-AMP was not found on your system. Please ensure that MS-AMP is available "
                " or choose `'te'` as the backend for FP8 mixed precision training."
            )
        # We've already checked for FSDP + MS-AMP during `__init__`
        import msamp

        model, optimizer = None, None
        optimizer_index = None
        num_models, num_optimizers = 0, 0
        result = list(args)
        for i, obj in enumerate(result):
            if isinstance(obj, torch.nn.Module):
                model = obj
                num_models += 1
            elif isinstance(obj, torch.optim.Optimizer):
                optimizer = obj
                optimizer_index = i
                num_optimizers += 1

        # DataLoader/Scheduler case: nothing to wrap. Return a tuple here as well so that both exits of this
        # method hand back the same container type.
        if optimizer is None and model is None:
            return tuple(result), device_placement
        if optimizer is None or model is None:
            raise ValueError(
                "You must pass a model and an optimizer together to `accelerate.prepare()` when using MS-AMP."
            )
        if num_models > 1 or num_optimizers > 1:
            raise ValueError(
                f"You can't use multiple models ({num_models}) or optimizers {num_optimizers} with MS-AMP."
            )

        # DEPRECATE @ 2.0
        if self.fp8_recipe_handler is not None:
            opt_level = self.fp8_recipe_handler.opt_level
        else:
            opt_level = self.msamp_recipe_handler.opt_level
        model, optimizer = msamp.initialize(model, optimizer, opt_level=opt_level)

        for i, obj in enumerate(result):
            if isinstance(obj, torch.nn.Module):
                result[i] = model
            elif isinstance(obj, torch.optim.Optimizer):
                result[i] = optimizer

        # NOTE: MS-AMP moves the optimizer, but *not* the model to the right device.
        # Work on a copy so that the list owned by the caller is left untouched.
        device_placement = list(device_placement)
        device_placement[optimizer_index] = False
        return tuple(result), device_placement

    def prepare_data_loader(
        self, data_loader: torch.utils.data.DataLoader, device_placement=None, slice_fn_for_dispatch=None
    ):
        """
        Wraps a PyTorch DataLoader so that it can be used in any distributed setup. Using [`Accelerator.prepare`]
        instead is recommended.

        Args:
            data_loader (`torch.utils.data.DataLoader`):
                The vanilla PyTorch DataLoader to wrap.
            device_placement (`bool`, *optional*):
                Whether or not the prepared dataloader should place the batches on the proper device. When left to
                `None`, `self.device_placement` is used, except on XLA where `False` is used.
            slice_fn_for_dispatch (`Callable`, *optional*):
                If passed, this function will be used to slice tensors across `num_processes`. Will default to
                [`~utils.slice_tensors`]. This argument is used only when `dispatch_batches` is set to `True` and will
                be ignored otherwise.

        Returns:
            The prepared dataloader, or `data_loader` itself when it had already been prepared.

        Example:

        ```python
        >>> import torch
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> data_loader = torch.utils.data.DataLoader(...)
        >>> data_loader = accelerator.prepare_data_loader(data_loader, device_placement=True)
        ```
        """
        if getattr(data_loader, "_is_accelerate_prepared", False):
            # Never wrap a DataLoader twice (this can happen with `find_batch_size`); only make sure it is tracked.
            if data_loader not in self._dataloaders:
                self._dataloaders.append(data_loader)
            return data_loader

        if device_placement is None:
            if self.distributed_type != DistributedType.XLA:
                device_placement = self.device_placement
            else:
                device_placement = False

        mesh = self._prepare_device_mesh()
        loader_options = {
            "num_processes": self.num_processes,
            "process_index": self.process_index,
            "split_batches": self.split_batches,
            "put_on_device": device_placement,
            "rng_types": self.rng_types.copy(),
            "dispatch_batches": self.dispatch_batches,
            "even_batches": self.even_batches,
            "slice_fn_for_dispatch": slice_fn_for_dispatch,
            "use_seedable_sampler": self.use_seedable_sampler,
            "data_seed": self.dataloader_config.data_seed,
            "non_blocking": self.non_blocking,
            "use_stateful_dataloader": self.use_stateful_dataloader,
            "torch_device_mesh": mesh,
        }
        # Inside the method body this name resolves to the module-level `prepare_data_loader` helper.
        wrapped_loader = prepare_data_loader(data_loader, self.device, **loader_options)
        self._dataloaders.append(wrapped_loader)
        return wrapped_loader

    def prepare_optimizer(self, optimizer: torch.optim.Optimizer, device_placement=None):
        """
        Wraps a PyTorch Optimizer so that it can be used in any distributed setup. Using [`Accelerator.prepare`]
        instead is recommended.

        Args:
            optimizer (`torch.optim.Optimizer`):
                The vanilla PyTorch optimizer to wrap.
            device_placement (`bool`, *optional*):
                Whether or not to place the optimizer on the proper device. When left to `None`,
                `self.device_placement` is used.

        Returns:
            The `AcceleratedOptimizer` wrapping `optimizer`, or `optimizer` itself when it had already been prepared.

        Example:

        ```python
        >>> import torch
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> optimizer = torch.optim.Adam(...)
        >>> optimizer = accelerator.prepare_optimizer(optimizer, device_placement=True)
        ```
        """
        if is_lomo_available():
            # Imported locally to avoid circular imports, since lomo imports stuff from transformers & accelerate
            from lomo_optim import AdaLomo, Lomo

            # `|=` keeps the flag raised once any LOMO optimizer was seen, which supports multiple optimizers:
            # https://github.com/huggingface/accelerate/pull/2695#discussion_r1589164607
            self.has_lomo_optimizer |= isinstance(optimizer, (Lomo, AdaLomo))

        if getattr(optimizer, "_is_accelerate_prepared", False):
            # Never wrap an optimizer twice (this can happen with `find_batch_size`); only make sure it is tracked.
            if optimizer not in self._optimizers:
                self._optimizers.append(optimizer)
            return optimizer

        if device_placement is None:
            device_placement = self.device_placement

        # NOTE: with MS-AMP the scaler is *not* passed explicitly to the `AcceleratedOptimizer`, their optimizer
        # handles it for us.
        if self.fp8_backend == FP8BackendType.MSAMP:
            scaler = None
        else:
            scaler = self.scaler

        wrapped_optimizer = AcceleratedOptimizer(optimizer, device_placement=device_placement, scaler=scaler)
        self._optimizers.append(wrapped_optimizer)
        return wrapped_optimizer

    def prepare_scheduler(self, scheduler: LRScheduler):
        """
        Wraps a PyTorch Scheduler so that it can be used in any distributed setup. Using [`Accelerator.prepare`]
        instead is recommended.

        Args:
            scheduler (`torch.optim.lr_scheduler.LRScheduler`):
                The vanilla PyTorch scheduler to wrap.

        Returns:
            The `AcceleratedScheduler` wrapping `scheduler`, or `scheduler` itself when it had already been prepared.

        Example:

        ```python
        >>> import torch
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> optimizer = torch.optim.Adam(...)
        >>> scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, ...)
        >>> scheduler = accelerator.prepare_scheduler(scheduler)
        ```
        """
        if getattr(scheduler, "_is_accelerate_prepared", False):
            # Never wrap a scheduler twice (this can happen with `find_batch_size`); only make sure it is tracked.
            if scheduler not in self._schedulers:
                self._schedulers.append(scheduler)
            return scheduler

        # Look for the prepared optimizer wrapping the one `scheduler` is attached to; when there is none, the
        # full list of prepared optimizers is used instead.
        matching_optimizers = (
            opt for opt in self._optimizers if getattr(scheduler, "optimizer", None) == opt.optimizer
        )
        optimizer = next(matching_optimizers, self._optimizers)

        wrapped_scheduler = AcceleratedScheduler(
            scheduler,
            optimizer,
            step_with_optimizer=self.step_scheduler_with_optimizer,
            split_batches=self.split_batches,
        )
        self._schedulers.append(wrapped_scheduler)
        return wrapped_scheduler

    def backward(self, loss, **kwargs):
        """
        Divides the loss by the number of gradient accumulation steps (except with DeepSpeed) and dispatches to the
        `backward()` matching the current configuration.

        Should be used in lieu of `loss.backward()`.

        Args:
            loss:
                The loss to backpropagate.
            **kwargs:
                Forwarded to the underlying `backward()` call. `learning_rate` is additionally read and handed to
                `lomo_backward` when a LOMO optimizer was prepared and no scaler is in use.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator(gradient_accumulation_steps=2)
        >>> outputs = model(inputs)
        >>> loss = loss_fn(outputs, labels)
        >>> accelerator.backward(loss)
        ```
        """
        learning_rate = kwargs.get("learning_rate")

        if self.distributed_type == DistributedType.DEEPSPEED:
            # deepspeed handles loss scaling by gradient_accumulation_steps in its `backward`, so the loss is
            # passed along untouched
            self.deepspeed_engine_wrapped.backward(loss, sync_gradients=self.sync_gradients, **kwargs)
            return

        loss = loss / self.gradient_accumulation_steps

        if self.distributed_type == DistributedType.MEGATRON_LM:
            # No backward call is issued from here with Megatron-LM.
            return
        if self.scaler is not None:
            self.scaler.scale(loss).backward(**kwargs)
            return
        if learning_rate is not None and self.has_lomo_optimizer:
            self.lomo_backward(loss, learning_rate)
            return
        loss.backward(**kwargs)

    def set_trigger(self):
        """
        Raises the trigger on the current process by setting the internal flag tensor to 1. A later call to
        [`Accelerator.check_trigger`] is expected to follow, which looks at the flag across all processes.

        Note:
            Does not require `wait_for_everyone()`

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> # Assume later in the training script
        >>> # `should_do_breakpoint` is a custom function to monitor when to break,
        >>> # e.g. when the loss is NaN
        >>> if should_do_breakpoint(loss):
        ...     accelerator.set_trigger()
        >>> # Assume later in the training script
        >>> if accelerator.check_trigger():
        ...     break
        ```
        """
        raised_flag = torch.tensor(1, device=self.device)
        self.flag_tensor = raised_flag

    def check_trigger(self):
        """
        Reduces the internal flag tensor with [`Accelerator.reduce`] and reports whether the trigger was set on any
        of the processes. When it was, the local flag tensor is reset to 0.

        Note:
            Does not require `wait_for_everyone()`

        Returns:
            `bool`: `True` if the reduced flag is at least 1, `False` otherwise.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> # Assume later in the training script
        >>> # `should_do_breakpoint` is a custom function to monitor when to break,
        >>> # e.g. when the loss is NaN
        >>> if should_do_breakpoint(loss):
        ...     accelerator.set_trigger()
        >>> # Assume later in the training script
        >>> if accelerator.check_trigger():
        ...     break
        ```
        """
        if self.flag_tensor is None:
            # The flag is created lazily, now that we are outside `__init__` and can put it on the device.
            self.flag_tensor = torch.tensor(0, device=self.device)

        triggered = self.reduce(self.flag_tensor).item() >= 1
        if triggered:
            # Lower the flag again so that the next check starts from a clean state.
            self.flag_tensor = torch.tensor(0, device=self.device)
        return triggered

    def unscale_gradients(self, optimizer=None):
        """
        Unscales the gradients when training in fp16 mixed precision with native AMP. Does nothing in any other
        setting.

        Likely should be called through [`Accelerator.clip_grad_norm_`] or [`Accelerator.clip_grad_value_`]

        Args:
            optimizer (`torch.optim.Optimizer` or `list[torch.optim.Optimizer]`, *optional*):
                The optimizer(s) for which to unscale gradients. If not set, will unscale gradients on all optimizers
                that were passed to [`~Accelerator.prepare`].

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> model, optimizer = accelerator.prepare(model, optimizer)
        >>> outputs = model(inputs)
        >>> loss = loss_fn(outputs, labels)
        >>> accelerator.backward(loss)
        >>> accelerator.unscale_gradients(optimizer=optimizer)
        ```
        """
        if not (self.native_amp and self.mixed_precision == "fp16"):
            return

        if optimizer is None:
            # TODO: this unscales all optimizers where we should only unscale the one where parameters are.
            targets = self._optimizers
        elif isinstance(optimizer, (tuple, list)):
            targets = optimizer
        else:
            targets = [optimizer]

        for target in targets:
            # Peel off every `AcceleratedOptimizer` layer; the scaler is given the innermost optimizer.
            inner = target
            while isinstance(inner, AcceleratedOptimizer):
                inner = inner.optimizer
            self.scaler.unscale_(inner)

    def clip_grad_norm_(self, parameters, max_norm, norm_type=2):
        """
        Should be used in place of `torch.nn.utils.clip_grad_norm_`.

        Args:
            parameters:
                The parameters whose gradients should be clipped.
            max_norm:
                The maximum norm of the gradients.
            norm_type (defaults to 2):
                The type of the norm used.

        Returns:
            `torch.Tensor`: Total norm of the parameter gradients (viewed as a single vector). With DeepSpeed, the
            value returned by the wrapped engine's `get_global_grad_norm()` is returned instead, or `None` when no
            engine is wrapped.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator(gradient_accumulation_steps=2)
        >>> dataloader, model, optimizer, scheduler = accelerator.prepare(dataloader, model, optimizer, scheduler)

        >>> for input, target in dataloader:
        ...     optimizer.zero_grad()
        ...     output = model(input)
        ...     loss = loss_func(output, target)
        ...     accelerator.backward(loss)
        ...     if accelerator.sync_gradients:
        ...         accelerator.clip_grad_norm_(model.parameters(), max_grad_norm)
        ...     optimizer.step()
        ```
        """
        backend = self.distributed_type
        if backend == DistributedType.FSDP:
            self.unscale_gradients()
            # Materialize the parameters so that they can be compared against each prepared model.
            parameters = list(parameters)
            for prepared_model in self._models:
                if parameters == list(prepared_model.parameters()):
                    if self.is_fsdp2:
                        # viz: https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md
                        return torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type)
                    return prepared_model.clip_grad_norm_(max_norm, norm_type)
        elif backend == DistributedType.DEEPSPEED:
            # DeepSpeed handles gradient clipping internally, but we can retrieve the gradient norm
            wrapped_engine = self.deepspeed_engine_wrapped
            if wrapped_engine is None:
                return None
            return wrapped_engine.get_global_grad_norm()
        elif backend == DistributedType.XLA:
            # Reduce gradients first for XLA
            for acc_opt in self._optimizers:
                if acc_opt.gradient_state.is_xla_gradients_synced:
                    continue
                inner_opt = acc_opt
                while isinstance(inner_opt, AcceleratedOptimizer):
                    inner_opt = inner_opt.optimizer
                gradients = xm._fetch_gradients(inner_opt)
                # Use xm.all_reduce to perform an in-place all-reduce. Recursive all-reduce each tensor
                # one by one in self.reduce is non-inplace.
                xm.all_reduce("sum", gradients, scale=1.0 / self.num_processes)
                # Set is_xla_gradients_synced to True to avoid all-reduce twice in the AcceleratedOptimizer step.
                acc_opt.gradient_state.is_xla_gradients_synced = True
            use_fsdp = os.environ.get("ACCELERATE_USE_FSDP", "false").lower() == "true"
            if use_fsdp:
                self.unscale_gradients()
                parameters = list(parameters)
                for prepared_model in self._models:
                    if parameters == list(prepared_model.parameters()):
                        return prepared_model.clip_grad_norm_(max_norm, norm_type)

        # Generic path, also reached when no prepared model matched `parameters` above.
        self.unscale_gradients()
        return torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type)

    def clip_grad_value_(self, parameters, clip_value):
        """
        Should be used in place of `torch.nn.utils.clip_grad_value_`.

        Args:
            parameters:
                The parameters whose gradients should be clipped.
            clip_value:
                The value passed on to `torch.nn.utils.clip_grad_value_`.

        Raises:
            Exception: When running with DeepSpeed or FSDP, which do not support this operation.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator(gradient_accumulation_steps=2)
        >>> dataloader, model, optimizer, scheduler = accelerator.prepare(dataloader, model, optimizer, scheduler)

        >>> for input, target in dataloader:
        ...     optimizer.zero_grad()
        ...     output = model(input)
        ...     loss = loss_func(output, target)
        ...     accelerator.backward(loss)
        ...     if accelerator.sync_gradients:
        ...         accelerator.clip_grad_value_(model.parameters(), clip_value)
        ...     optimizer.step()
        ```
        """
        unsupported_backends = (DistributedType.DEEPSPEED, DistributedType.FSDP)
        if self.distributed_type in unsupported_backends:
            raise Exception("DeepSpeed and FSDP  do not support `clip_grad_value_`. Use `clip_grad_norm_` instead.")

        # Gradients are unscaled first so that the clipping applies to their real values.
        self.unscale_gradients()
        torch.nn.utils.clip_grad_value_(parameters, clip_value)

    def gather(self, tensor):
        """
        Collects the values of *tensor* from every process and concatenates them along the first dimension. Handy to
        regroup the predictions from all processes when doing evaluation.

        Note:
            This gather happens in all processes.

        Args:
            tensor (`torch.Tensor`, or a nested tuple/list/dictionary of `torch.Tensor`):
                The tensors to gather across all processes.

        Returns:
            `torch.Tensor`, or a nested tuple/list/dictionary of `torch.Tensor`: The gathered tensor(s). Note that the
            first dimension of the result is *num_processes* multiplied by the first dimension of the input tensors.

        Example:

        ```python
        >>> # Assuming four processes
        >>> import torch
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> process_tensor = torch.tensor([accelerator.process_index], device=accelerator.device)
        >>> gathered_tensor = accelerator.gather(process_tensor)
        >>> gathered_tensor
        tensor([0, 1, 2, 3])
        ```
        """
        # Inside the method body this name resolves to the module-level `gather` helper.
        gathered = gather(tensor)
        return gathered

    def gather_for_metrics(self, input_data, use_gather_object=False):
        """
        Gathers `input_data` and potentially drops duplicates in the last batch if on a distributed system. Should be
        used for gathering the inputs and targets for metric calculation.

        Args:
            input_data (`torch.Tensor`, `object`, a nested tuple/list/dictionary of `torch.Tensor`, or a nested tuple/list/dictionary of `object`):
                The tensors or objects for calculating metrics across all processes
            use_gather_object(`bool`):
                Whether to forcibly use gather_object instead of gather (which is already done if all objects passed do
                not contain tensors). This flag can be useful for gathering tensors with different sizes that we don't
                want to pad and concatenate along the first dimension. Using it with GPU tensors is not well supported
                and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled.

        Returns:
            The gathered data, truncated to the remainder of the dataset when at the end of a dataloader whose
            gradient state reports a positive remainder.

        Example:

        ```python
        >>> # Assuming two processes, with a batch size of 5 on a dataset with 9 samples
        >>> import torch
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> dataloader = torch.utils.data.DataLoader(range(9), batch_size=5)
        >>> dataloader = accelerator.prepare(dataloader)
        >>> batch = next(iter(dataloader))
        >>> gathered_items = accelerator.gather_for_metrics(batch)
        >>> len(gathered_items)
        9
        ```
        """
        # `recursively_apply` raises a `TypeError` as soon as it meets something that is not a tensor.
        try:
            recursively_apply(lambda x: x, input_data, error_on_other_type=True)
        except TypeError:
            only_tensors = False
        else:
            only_tensors = True

        use_gather_object = use_gather_object or not only_tensors
        data = gather_object(input_data) if use_gather_object else self.gather(input_data)

        try:
            if not self.gradient_state.end_of_dataloader:
                # Not at the end of the dataloader, no need to adjust the tensors
                return data

            # at the end of a dataloader, `gather_for_metrics` regresses to
            # `gather` unless the dataset has a remainder so log.
            remainder = self.gradient_state.remainder
            if remainder == -1:
                logger.info(
                    "The used dataset had no length, returning gathered tensors. You should drop the remainder yourself."
                )
                return data
            if remainder <= 0:
                # no remainder even though at end of dataloader, so nothing to do.
                return data

            # Last batch needs to be truncated on distributed systems as it contains additional samples
            def _keep_remainder(tensor):
                return tensor[:remainder]

            if use_gather_object:
                # gather_object put the objects in a list
                return _keep_remainder(data)
            return recursively_apply(_keep_remainder, data)
        except Exception:
            # Dataset had no length or raised an error
            return data

    def reduce(self, tensor, reduction="sum", scale=1.0):
        """
        Reduces the values of *tensor* across all processes according to *reduction*.

        Note:
            All processes get the reduced value.

        Args:
            tensor (`torch.Tensor`, or a nested tuple/list/dictionary of `torch.Tensor`):
                The tensors to reduce across all processes.
            reduction (`str`, *optional*, defaults to "sum"):
                A reduction type, can be one of 'sum', 'mean', or 'none'. If 'none', will not perform any operation.
            scale (`float`, *optional*, defaults to 1.0):
                A default scaling value to be applied after the reduce, only valid on XLA.

        Returns:
            `torch.Tensor`, or a nested tuple/list/dictionary of `torch.Tensor`:
                The reduced tensor(s).

        Example:

        ```python
        >>> # Assuming two processes
        >>> import torch
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> process_tensor = torch.arange(accelerator.num_processes) + 1 + (2 * accelerator.process_index)
        >>> process_tensor = process_tensor.to(accelerator.device)
        >>> reduced_tensor = accelerator.reduce(process_tensor, reduction="sum")
        >>> reduced_tensor
        tensor([4, 6])
        ```
        """
        # Inside the method body this name resolves to the module-level `reduce` helper.
        reduced = reduce(tensor, reduction, scale)
        return reduced

    def pad_across_processes(self, tensor, dim=0, pad_index=0, pad_first=False):
        """
        Recursively pads the tensors in a nested list/tuple/dictionary of tensors from all devices to the same size,
        so that they can safely be gathered afterwards.

        Args:
            tensor (nested list/tuple/dictionary of `torch.Tensor`):
                The data to pad.
            dim (`int`, *optional*, defaults to 0):
                The dimension on which to pad.
            pad_index (`int`, *optional*, defaults to 0):
                The value with which to pad.
            pad_first (`bool`, *optional*, defaults to `False`):
                Whether to pad at the beginning or the end.

        Returns:
            `torch.Tensor`, or a nested tuple/list/dictionary of `torch.Tensor`:
                The padded tensor(s).

        Example:

        ```python
        >>> # Assuming two processes, with the first processes having a tensor of size 1 and the second of size 2
        >>> import torch
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> process_tensor = torch.arange(accelerator.process_index + 1).to(accelerator.device)
        >>> padded_tensor = accelerator.pad_across_processes(process_tensor)
        >>> padded_tensor.shape
        torch.Size([2])
        ```
        """
        # Inside the method body this name resolves to the module-level `pad_across_processes` helper.
        padded = pad_across_processes(tensor, dim=dim, pad_index=pad_index, pad_first=pad_first)
        return padded

    def unwrap_model(self, model, keep_fp32_wrapper: bool = True, keep_torch_compile: bool = True):
        """
        Unwraps the `model` from the additional layer possibly added by [`~Accelerator.prepare`]. Useful before saving
        the model.

        The work is delegated to `extract_model_from_parallel`, which receives the three arguments positionally.

        Args:
            model (`torch.nn.Module`):
                The model to unwrap.
            keep_fp32_wrapper (`bool`, *optional*, defaults to `True`):
                Whether to keep (i.e. not remove) the mixed precision hook if it was added.
            keep_torch_compile (`bool`, *optional*, defaults to `True`):
                Whether to keep the model compiled (i.e. not unwrap the compiled model) if it was compiled.

        Returns:
            `torch.nn.Module`: The unwrapped model.

        Example:

        ```python
        >>> # Assuming two GPU processes
        >>> from torch.nn.parallel import DistributedDataParallel
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> model = accelerator.prepare(MyModel())
        >>> print(model.__class__.__name__)
        DistributedDataParallel

        >>> model = accelerator.unwrap_model(model)
        >>> print(model.__class__.__name__)
        MyModel
        ```
        """
        return extract_model_from_parallel(model, keep_fp32_wrapper, keep_torch_compile)

    def wait_for_everyone(self):
        """
        Will stop the execution of the current process until every other process has reached that point (so this does
        nothing when the script is only run in one process). Useful to do before saving a model.

        This simply calls the module-level `wait_for_everyone` helper; nothing is returned.

        Example:

        ```python
        >>> # Assuming two GPU processes
        >>> import time
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> if accelerator.is_main_process:
        ...     time.sleep(2)
        ... else:
        ...     print("I'm waiting for the main process to finish its sleep...")
        >>> accelerator.wait_for_everyone()
        >>> # Should print on every process at the same time
        >>> print("Everyone is here")
        ```
        """
        wait_for_everyone()

    @on_main_process
    def init_trackers(self, project_name: str, config: dict | None = None, init_kwargs: dict | None = {}):
        """
        Initializes a run for all trackers stored in `self.log_with`, potentially with starting configurations

        Args:
            project_name (`str`):
                The name of the project. All trackers will save their data based on this
            config (`dict`, *optional*):
                Optional starting configuration to be logged.
            init_kwargs (`dict`, *optional*):
                A nested dictionary of kwargs to be passed to a specific tracker's `__init__` function. Should be
                formatted like so:
                ```python
                {"wandb": {"tags": ["tag_a", "tag_b"]}}
                ```

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator(log_with="tensorboard")
        >>> accelerator.init_trackers(
        ...     project_name="my_project",
        ...     config={"learning_rate": 0.001, "batch_size": 32},
        ...     init_kwargs={"tensorboard": {"flush_secs": 60}},
        ... )
        ```
        """
        for tracker in self.log_with:
            if issubclass(type(tracker), GeneralTracker):
                # Custom trackers are already initialized
                self.trackers.append(tracker)
            else:
                tracker_init = LOGGER_TYPE_TO_CLASS[str(tracker)]
                if tracker_init.requires_logging_directory:
                    # We can skip this check since it was done in `__init__`
                    self.trackers.append(
                        tracker_init(project_name, self.logging_dir, **init_kwargs.get(str(tracker), {}))
                    )
                else:
                    self.trackers.append(tracker_init(project_name, **init_kwargs.get(str(tracker), {})))

        for tracker in self.trackers:
            tracker.start()

        if config is not None:
            for tracker in self.trackers:
                tracker.store_init_configuration(config)

    def get_tracker(self, name: str, unwrap: bool = False):
        """
        Looks up a tracker in `self.trackers` by its `.name`.

        When `self.trackers` is empty (trackers are only created on the main process), a blank `GeneralTracker` is
        returned instead of raising, so the call is safe on every process.

        Args:
            name (`str`):
                The name of a tracker, corresponding to the `.name` property.
            unwrap (`bool`):
                If `True`, return the tracker's internal tracking object (its `.tracker` attribute); otherwise return
                the wrapped tracker itself (recommended).

        Returns:
            `GeneralTracker`: The tracker corresponding to `name` if it exists.

        Raises:
            ValueError: If trackers exist but none of them is called `name`.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator(log_with="tensorboard")
        >>> accelerator.init_trackers("my_project")
        >>> tensorboard_tracker = accelerator.get_tracker("tensorboard")
        ```
        """
        if len(self.trackers) == 0:
            # Handle tracker only made on main process
            return GeneralTracker(_blank=True)

        for candidate in self.trackers:
            if candidate.name != name:
                continue
            if unwrap:
                return candidate.tracker
            return candidate

        raise ValueError(f"{name} is not an available tracker stored inside the `Accelerator`.")

    @on_main_process
    def log(self, values: dict, step: int | None = None, log_kwargs: dict | None = {}):
        """
        Logs `values` to all stored trackers in `self.trackers` on the main process only.

        Args:
            values (`dict`):
                Values should be a dictionary-like object containing only types `int`, `float`, or `str`.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            log_kwargs (`dict`, *optional*):
                A nested dictionary of kwargs to be passed to a specific tracker's `log` function. Should be formatted
                like so:
                ```python
                {"wandb": {"tags": ["tag_a", "tag_b"]}}
                ```

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator(log_with="tensorboard")
        >>> accelerator.init_trackers("my_project")
        >>> accelerator.log({"loss": 0.5, "accuracy": 0.9})
        ```
        """
        for tracker in self.trackers:
            tracker.log(values, step=step, **log_kwargs.get(tracker.name, {}))

    def end_training(self):
        """
        Runs the end-of-training clean-up: calls `finish()` on every tracker in `self.trackers`, then destroys the
        process group through `self.state`. Should always be called at the end of your script if using experiment
        tracking.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator(log_with="tensorboard")
        >>> accelerator.init_trackers("my_project")
        >>> # Do training
        >>> accelerator.end_training()
        ```
        """
        # Trackers are closed first, in registration order
        for active_tracker in self.trackers:
            active_tracker.finish()

        # Tear down distributed communication last
        process_state = self.state
        process_state.destroy_process_group()

    def save(self, obj, f, safe_serialization: bool = False):
        """
        Save the object passed to disk once per machine. Use in place of `torch.save`.

        Delegates to the module-level `save` helper, passing along `save_on_each_node` from
        `self.project_configuration`.

        Args:
            obj (`object`): The object to save.
            f (`str` or `os.PathLike`): Where to save the content of `obj`.
            safe_serialization (`bool`, *optional*, defaults to `False`): Whether to save `obj` using `safetensors`

        Note:
            If `save_on_each_node` was passed in as a `ProjectConfiguration`, will save the object once per node,
            rather than only once on the main node.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> arr = [0, 1, 2, 3]
        >>> accelerator.save(arr, "array.pkl")
        ```
        """
        save(
            obj,
            f,
            save_on_each_node=self.project_configuration.save_on_each_node,
            safe_serialization=safe_serialization,
        )

    def save_model(
        self,
        model: torch.nn.Module,
        save_directory: Union[str, os.PathLike],
        max_shard_size: Union[int, str] = "10GB",
        safe_serialization: bool = True,
    ):
        """
        Save a model so that it can be re-loaded using load_checkpoint_in_model

        The state dict is split into shards with `huggingface_hub.split_torch_state_dict_into_shards`, shard files
        left over from a previous save that will not be overwritten are removed, every shard is written through
        [`Accelerator.save`], and an index file is written when the checkpoint is sharded.

        Nothing is written (and the method returns early) when `save_directory` is an existing file, in which case an
        error is logged, or when no state dict is available on this process.

        Arguments:
            model: (`torch.nn.Module`):
                Model to be saved. The model can be wrapped or unwrapped.
            save_directory (`str` or `os.PathLike`):
                Directory to which to save. Will be created if it doesn't exist.
            max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
                The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
                lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).

                <Tip warning={true}>

                If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
                which will be bigger than `max_shard_size`.

                </Tip>

            safe_serialization (`bool`, *optional*, defaults to `True`):
                Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).

        Raises:
            RuntimeError: If the model has no offloaded parameters but some parameters live on the meta device.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> model = ...
        >>> accelerator.save_model(model, save_directory)
        ```
        """

        if os.path.isfile(save_directory):
            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
            return

        # get the state_dict of the model
        if any(has_offloaded_params(module) for module in model.modules()):
            state_dict = get_state_dict_offloaded_model(model)
        else:
            if any(param.device == torch.device("meta") for param in model.parameters()):
                raise RuntimeError("You can't save the model since some parameters are on the meta device.")
            state_dict = self.get_state_dict(model)

        # Case: DeepSpeed zero3 gets gathered and `state_dict` is empty
        if state_dict is None:
            return
        os.makedirs(save_directory, exist_ok=True)

        if safe_serialization:
            state_dict = clean_state_dict_for_safetensors(state_dict)
        weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
        filename_pattern = SAFE_WEIGHTS_PATTERN_NAME if safe_serialization else WEIGHTS_PATTERN_NAME

        from huggingface_hub import split_torch_state_dict_into_shards

        state_dict_split = split_torch_state_dict_into_shards(
            state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size
        )

        def _strip_weights_suffix(name):
            # Drop the serialization extension of either format so that shard names can be compared on their stem.
            # Stripping `.safetensors` as well lets stale safetensors shards be recognised, not only `.bin` ones.
            return name.replace(".bin", "").replace(".safetensors", "")

        # Loop-invariant pieces of the stale-shard check
        weights_no_suffix = _strip_weights_suffix(weights_name)
        # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005
        shard_name_pattern = re.compile(r"(.*?)-\d{5}-of-\d{5}")

        # Clean the folder from a previous save
        for filename in os.listdir(save_directory):
            full_filename = os.path.join(save_directory, filename)
            # If we have a shard file that is not going to be replaced, we delete it, but only from the main process
            # in distributed settings to avoid race conditions.
            if (
                filename.startswith(weights_no_suffix)
                and os.path.isfile(full_filename)
                and filename not in state_dict_split.filename_to_tensors
                and shard_name_pattern.fullmatch(_strip_weights_suffix(filename)) is not None
                and PartialState().is_main_process
            ):
                os.remove(full_filename)

        # Save the model
        for filename, tensors in state_dict_split.filename_to_tensors.items():
            shard = {tensor: state_dict[tensor] for tensor in tensors}
            self.save(shard, os.path.join(save_directory, filename), safe_serialization=safe_serialization)

        # Save index if sharded
        if state_dict_split.is_sharded:
            index = {
                "metadata": state_dict_split.metadata,
                "weight_map": state_dict_split.tensor_to_filename,
            }
            save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME
            save_index_file = os.path.join(save_directory, save_index_file)
            with open(save_index_file, "w", encoding="utf-8") as f:
                content = json.dumps(index, indent=2, sort_keys=True) + "\n"
                f.write(content)
            logger.info(
                f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be "
                f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where each parameters has been saved in the "
                f"index located at {save_index_file}."
            )
        else:
            # Report the file name matching the serialization format that was actually used
            path_to_weights = os.path.join(save_directory, weights_name)
            logger.info(f"Model weights saved in {path_to_weights}")

    def register_save_state_pre_hook(self, hook: Callable[..., None]) -> hooks.RemovableHandle:
        """
        Registers a pre hook that [`Accelerator.save_state`] runs right before it calls `save_checkpoint`.

        Args:
            hook (`Callable`):
                A function to be called in [`Accelerator.save_state`] before `save_checkpoint`.

        The hook should have the following signature:

        `hook(models: list[torch.nn.Module], weights: list[dict[str, torch.Tensor]], output_dir: str) -> None`

        The `models` argument are the models as saved in the accelerator state under `accelerator._models`, `weights`
        argument are the state dicts of the `models`, and the `output_dir` argument is the directory the checkpoint
        is being written to by [`Accelerator.save_state`].

        <Tip>

        Should only be used in conjunction with [`Accelerator.register_load_state_pre_hook`]. Can be useful to save
        configurations in addition to model weights. Can also be used to overwrite model saving with a customized
        method. In this case, make sure to remove the weights you already saved yourself from the weights list.

        </Tip>

        Returns:
            `torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling
            `handle.remove()`
        """
        registry = self._save_model_state_pre_hook
        # The handle remembers the registry so that `remove()` can delete the entry stored under its id
        removable = hooks.RemovableHandle(registry)
        registry[removable.id] = hook
        return removable

    def save_state(self, output_dir: str | None = None, safe_serialization: bool = True, **save_model_func_kwargs):
        """
        Saves the current states of the model, optimizer, scaler, RNG generators, and registered objects to a folder.

        If a `ProjectConfiguration` was passed to the `Accelerator` object with `automatic_checkpoint_naming` enabled
        then checkpoints will be saved to `self.project_dir/checkpoints`. If the number of current saves is greater
        than `total_limit` then the oldest save is deleted. Each checkpoint is saved in separate folders named
        `checkpoint_<iteration>`.

        Otherwise they are just saved to `output_dir`.

        <Tip>

        Should only be used when wanting to save a checkpoint during training and restoring the state in the same
        environment.

        </Tip>

        Args:
            output_dir (`str` or `os.PathLike`):
                The name of the folder to save all relevant weights and states.
            safe_serialization (`bool`, *optional*, defaults to `True`):
                Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
            save_model_func_kwargs (`dict`, *optional*):
                Additional keyword arguments for saving model which can be passed to the underlying save function, such
                as optional arguments for DeepSpeed's `save_checkpoint` function.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> model, optimizer, lr_scheduler = ...
        >>> model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
        >>> accelerator.save_state(output_dir="my_checkpoint")
        ```
        """
        if self.project_configuration.automatic_checkpoint_naming:
            output_dir = os.path.join(self.project_dir, "checkpoints")
        os.makedirs(output_dir, exist_ok=True)
        if self.project_configuration.automatic_checkpoint_naming:
            folders = [os.path.join(output_dir, folder) for folder in os.listdir(output_dir)]
            if (
                self.project_configuration.total_limit is not None
                and (len(folders) + 1 > self.project_configuration.total_limit)
                and self.is_main_process
            ):

                def _inner(folder):
                    return list(map(int, re.findall(r"[\/]?([0-9]+)(?=[^\/]*$)", folder)))[0]

                folders.sort(key=_inner)
                logger.warning(
                    f"Deleting {len(folders) + 1 - self.project_configuration.total_limit} checkpoints to make room for new checkpoint."
                )
                for folder in folders[: len(folders) + 1 - self.project_configuration.total_limit]:
                    shutil.rmtree(folder)
            output_dir = os.path.join(output_dir, f"checkpoint_{self.save_iteration}")
            if os.path.exists(output_dir):
                raise ValueError(
                    f"Checkpoint directory {output_dir} ({self.save_iteration}) already exists. Please manually override `self.save_iteration` with what iteration to start with."
                )
            self.wait_for_everyone()
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Saving current state to {output_dir}")

        if self.distributed_type == DistributedType.XLA:
            # Finish running the previous step before checkpointing
            xm.mark_step()

        # Save the models taking care of FSDP and DeepSpeed nuances
        weights = []
        for i, model in enumerate(self._models):
            if self.distributed_type == DistributedType.FSDP:
                logger.info("Saving FSDP model")
                save_fsdp_model(self.state.fsdp_plugin, self, model, output_dir, i)
                logger.info(f"FSDP Model saved to output dir {output_dir}")
            elif self.distributed_type == DistributedType.DEEPSPEED:
                logger.info("Saving DeepSpeed Model and Optimizer")
                ckpt_id = f"{MODEL_NAME}" if i == 0 else f"{MODEL_NAME}_{i}"
                model.save_checkpoint(output_dir, ckpt_id, **save_model_func_kwargs)
                logger.info(f"DeepSpeed Model and Optimizer saved to output dir {os.path.join(output_dir, ckpt_id)}")
            elif self.distributed_type == DistributedType.MEGATRON_LM:
                logger.info("Saving Megatron-LM Model, Optimizer and Scheduler")
                model.save_checkpoint(output_dir)
                logger.info(f"Megatron-LM Model , Optimizer and Scheduler saved to output dir {output_dir}")
            else:
                weights.append(self.get_state_dict(model, unwrap=False))

        # Save the optimizers taking care of FSDP and DeepSpeed nuances
        optimizers = []
        if self.distributed_type == DistributedType.FSDP:
            for i, opt in enumerate(self._optimizers):
                logger.info("Saving FSDP Optimizer")
                save_fsdp_optimizer(self.state.fsdp_plugin, self, opt, self._models[i], output_dir, i)
                logger.info(f"FSDP Optimizer saved to output dir {output_dir}")
        elif self.distributed_type not in [DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM]:
            optimizers = self._optimizers

        # Save the lr schedulers taking care of DeepSpeed nuances
        schedulers = []
        if self.distributed_type == DistributedType.DEEPSPEED:
            for i, scheduler in enumerate(self._schedulers):
                if isinstance(scheduler, DeepSpeedSchedulerWrapper):
                    continue
                schedulers.append(scheduler)
        elif self.distributed_type not in [DistributedType.MEGATRON_LM]:
            schedulers = self._schedulers

        # Save the samplers of the dataloaders
        dataloaders = self._dataloaders

        # Call model loading hooks that might have been registered with
        # accelerator.register_model_state_hook
        for hook in self._save_model_state_pre_hook.values():
            hook(self._models, weights, output_dir)

        save_location = save_accelerator_state(
            output_dir,
            weights,
            optimizers,
            schedulers,
            dataloaders,
            self.state.process_index,
            self.step,
            self.scaler,
            save_on_each_node=self.project_configuration.save_on_each_node,
            safe_serialization=safe_serialization,
        )
        for i, obj in enumerate(self._custom_objects):
            save_custom_state(obj, output_dir, i, save_on_each_node=self.project_configuration.save_on_each_node)
        self.project_configuration.iteration += 1
        return save_location

    def register_load_state_pre_hook(self, hook: Callable[..., None]) -> hooks.RemovableHandle:
        """
        Registers a pre hook that [`Accelerator.load_state`] runs right before it calls [`load_checkpoint`].

        Args:
            hook (`Callable`):
                A function to be called in [`Accelerator.load_state`] before `load_checkpoint`.

        The hook should have the following signature:

        `hook(models: list[torch.nn.Module], input_dir: str) -> None`

        The `models` argument are the models as saved in the accelerator state under `accelerator._models`, and the
        `input_dir` argument is the `input_dir` argument passed to [`Accelerator.load_state`].

        <Tip>

        Should only be used in conjunction with [`Accelerator.register_save_state_pre_hook`]. Can be useful to load
        configurations in addition to model weights. Can also be used to overwrite model loading with a customized
        method. In this case, make sure to remove already loaded models from the models list.

        </Tip>

        Returns:
            `torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling
            `handle.remove()`
        """
        registry = self._load_model_state_pre_hook
        # The handle remembers the registry so that `remove()` can delete the entry stored under its id
        removable = hooks.RemovableHandle(registry)
        registry[removable.id] = hook
        return removable

    def load_state(self, input_dir: str | None = None, load_kwargs: dict | None = None, **load_model_func_kwargs):
        """
        Loads the current states of the model, optimizer, scaler, RNG generators, and registered objects.

        <Tip>

        Should only be used in conjunction with [`Accelerator.save_state`]. If a file is not registered for
        checkpointing, it will not be loaded if stored in the directory.

        </Tip>

        Args:
            input_dir (`str` or `os.PathLike`):
                The name of the folder all relevant weights and states were saved in. Can be `None` if
                `automatic_checkpoint_naming` is used, and will pick up from the latest checkpoint.
            load_kwargs (`dict`, *optional*):
                Additional keyword arguments for the underlying `load` function. The dictionary is handed as is to
                `load_accelerator_state`.
            load_model_func_kwargs (`dict`, *optional*):
                Additional keyword arguments for loading model which can be passed to the underlying load function,
                such as optional arguments for DeepSpeed's `load_checkpoint` function or a `map_location` to load the
                model and optimizer on. When no `map_location` is given, `"on_device"` is used if there is more than
                one process on a multi-device setup other than multi-XPU, and `"cpu"` otherwise.

        Raises:
            ValueError: If `input_dir` is given but is not an existing directory, or if it is `None` while
                `automatic_checkpoint_naming` is disabled.
            RuntimeError: If the number of `custom_checkpoint_<n>.pkl` files in the folder differs from the number
                of registered custom objects.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> model, optimizer, lr_scheduler = ...
        >>> model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
        >>> accelerator.load_state("my_checkpoint")
        ```
        """
        if input_dir is not None:
            # Check if folder exists
            input_dir = os.path.expanduser(input_dir)
            if not os.path.isdir(input_dir):
                raise ValueError(f"Tried to find {input_dir} but folder does not exist")
        elif self.project_configuration.automatic_checkpoint_naming:
            # Pick up from automatic checkpoint naming
            input_dir = os.path.join(self.project_dir, "checkpoints")
            folders = [os.path.join(input_dir, folder) for folder in os.listdir(input_dir)]

            def _inner(folder):
                # Sort key: the first run of digits found in the last path component
                return list(map(int, re.findall(r"[\/]?([0-9]+)(?=[^\/]*$)", folder)))[0]

            folders.sort(key=_inner)
            # NOTE(review): an empty `checkpoints` folder makes the next line raise `IndexError`, and an entry
            # without digits makes `_inner` raise `IndexError` as well - consider a clearer error here.
            input_dir = folders[-1]
        else:
            raise ValueError("No input_dir provided and automatic checkpoint naming is disabled.")
        logger.info(f"Loading states from {input_dir}")

        # Load the models taking care of FSDP and DeepSpeed nuances
        models = []
        for i, model in enumerate(self._models):
            if self.distributed_type == DistributedType.FSDP:
                logger.info("Loading FSDP model")
                load_fsdp_model(self.state.fsdp_plugin, self, model, input_dir, i)
                logger.info(f"FSDP Model loaded from input dir {input_dir}")
            elif self.distributed_type == DistributedType.DEEPSPEED:
                logger.info("Loading DeepSpeed Model and Optimizer")
                ckpt_id = f"{MODEL_NAME}" if i == 0 else f"{MODEL_NAME}_{i}"
                # NOTE(review): `map_location` is only popped from `load_model_func_kwargs` further down, so if
                # the caller passed it, it is still forwarded to DeepSpeed's `load_checkpoint` here - confirm
                # that this is intended.
                model.load_checkpoint(input_dir, ckpt_id, **load_model_func_kwargs)
                logger.info(f"DeepSpeed Model and Optimizer loaded from input dir {os.path.join(input_dir, ckpt_id)}")
            elif self.distributed_type == DistributedType.MEGATRON_LM:
                logger.info("Loading Megatron-LM Model, Optimizer and Scheduler")
                model.load_checkpoint(input_dir)
                logger.info(f"Megatron-LM Model , Optimizer and Scheduler loaded from input dir {input_dir}")
            else:
                # Every other backend: the model is loaded later by `load_accelerator_state`
                models.append(model)

        # We need to load the scaler state before the optimizer for FSDP2
        # (`torch.distributed.checkpoint.set_optimizer_state_dict`) which we use to set the state of the optimizer calls `optimizer.step` on
        # a dummy tensor, but since the scaler is not initialized, it will raise an error (the scaler exists but its `_scale` is None)
        # `scaler` stays `None` in that case so that `load_accelerator_state` is not handed the scaler a second time.
        scaler = None
        if self.scaler is not None and self.is_fsdp2:
            input_scaler_file = os.path.join(input_dir, SCALER_NAME)
            scaler_state = torch.load(input_scaler_file)
            self.scaler.load_state_dict(scaler_state)
            # We also need to call the `_lazy_init_scale_growth_tracker` to initialize the scaler, as it would else be called
            # on the first call to scale
            self.scaler._lazy_init_scale_growth_tracker(self.scaler._device)
            logger.info("GradScaler state loaded successfully")
        else:
            scaler = self.scaler

        # Load the optimizers taking care of FSDP and DeepSpeed nuances
        optimizers = []
        if self.distributed_type == DistributedType.FSDP:
            for i, opt in enumerate(self._optimizers):
                logger.info("Loading FSDP Optimizer")
                load_fsdp_optimizer(self.state.fsdp_plugin, self, opt, self._models[i], input_dir, i)
                logger.info(f"FSDP Optimizer loaded from input dir {input_dir}")
        elif self.distributed_type not in [DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM]:
            optimizers = self._optimizers

        # Load the lr schedulers taking care of DeepSpeed nuances
        schedulers = []
        if self.distributed_type == DistributedType.DEEPSPEED:
            for i, scheduler in enumerate(self._schedulers):
                # Schedulers wrapped by DeepSpeed are skipped here
                if isinstance(scheduler, DeepSpeedSchedulerWrapper):
                    continue
                schedulers.append(scheduler)
        elif self.distributed_type not in [DistributedType.MEGATRON_LM]:
            schedulers = self._schedulers

        dataloaders = self._dataloaders

        # Call the model loading pre-hooks that might have been registered with
        # `register_load_state_pre_hook`
        for hook in self._load_model_state_pre_hook.values():
            hook(models, input_dir)

        # An explicit `map_location` from the caller wins; otherwise pick a default based on the setup
        map_location = load_model_func_kwargs.pop("map_location", None)
        if map_location is None:
            if self.num_processes > 1 and self.multi_device and self.distributed_type != DistributedType.MULTI_XPU:
                map_location = "on_device"
            else:
                map_location = "cpu"

        override_attributes = load_accelerator_state(
            input_dir,
            models,
            optimizers,
            schedulers,
            dataloaders,
            self.state.process_index,
            scaler,
            map_location,
            load_kwargs,
            **load_model_func_kwargs,
        )
        # Restore the step counter when the loaded state provides one
        if "step" in override_attributes:
            self.step = override_attributes["step"]
        # Custom objects are matched to files named `custom_checkpoint_<n>.pkl`
        custom_checkpoints = [
            f for f in os.listdir(input_dir) if re.search(r"^custom_checkpoint_\d+\.pkl$", f) is not None
        ]
        if len(custom_checkpoints) != len(self._custom_objects):
            err = (
                f"Number of custom checkpoints in folder {input_dir} does not match the number of registered objects:"
            )
            err += f"\n\tFound checkpoints: {len(custom_checkpoints)}"
            err += f"\n\tRegistered objects: {len(self._custom_objects)}\n"
            err += "Please make sure to only load checkpoints from folders that were created with the same set of registered objects,"
            err += "or avoid using `custom_checkpoint` in the filename for files in that same directory and load them in manually."
            raise RuntimeError(err)
        else:
            logger.info(f"Loading in {len(custom_checkpoints)} custom states")
            for index, obj in enumerate(self._custom_objects):
                load_custom_state(obj, input_dir, index)

    def free_memory(self, *objects):
        """
        Drops the references this `Accelerator` keeps to prepared objects and releases `objects` through
        `release_memory`. Call it between two trainings that use different models/optimizers. `Accelerator.step` is
        reset to 0 as well.

        Args:
            *objects:
                The objects to release. They are forwarded to `release_memory`.

        Returns:
            Whatever `release_memory` returns for `objects`.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> model, optimizer, scheduler = ...
        >>> model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)
        >>> model, optimizer, scheduler = accelerator.free_memory(model, optimizer, scheduler)
        ```
        """
        # The DeepSpeed engine (when one was created) has to be destroyed before the generic cleanup below.
        if hasattr(self, "deepspeed_engine_wrapped"):
            wrapped_engine = self.deepspeed_engine_wrapped
            if wrapped_engine is not None:
                wrapped_engine.engine.destroy()
            self.deepspeed_engine_wrapped = None

        objects = release_memory(*objects)

        # Forget everything that was registered by previous `prepare` calls.
        self._schedulers, self._optimizers, self._models, self._dataloaders = [], [], [], []
        self.step = 0
        return objects

    def clear(self, *objects):
        """
        Alias for [`Accelerator.free_memory`]: forwards `objects` to it unchanged and returns its result. You should
        call this method between two trainings with different models/optimizers.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> model, optimizer, scheduler = ...
        >>> model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)
        >>> model, optimizer, scheduler = accelerator.clear(model, optimizer, scheduler)
        ```
        """
        released = self.free_memory(*objects)
        return released

    def _get_named_parameters(self, *args, drop_refs=False):
        named_parameters = {}
        accessor_mapping = {}
        for obj in args:
            if isinstance(obj, torch.nn.Module):
                obj = extract_model_from_parallel(obj)
                if not drop_refs:
                    named_parameters.update({n: p for n, p in obj.named_parameters()})
                    continue

                # we need this bit as `WeightWithDynamic...` returns 0 when `data_ptr()` is called,
                # the underlying pointer is actually hidden in `_tensor` attribute
                if self.fp8_backend == FP8BackendType.AO:
                    from torchao.float8.fsdp_utils import WeightWithDynamicFloat8CastTensor

                    accessor_mapping[WeightWithDynamicFloat8CastTensor] = "_tensor"
                _torch_distributed_available = torch.distributed.is_available()
                _is_dtensor_available = _torch_distributed_available and is_torch_version(
                    ">=", DTENSOR_PYTORCH_VERSION
                )
                # we know we're in FSDP2 so DTensor is available
                if _is_dtensor_available:
                    from torch.distributed.tensor import DTensor

                    accessor_mapping[DTensor] = "_local_tensor"

                named_parameters.update(
                    {
                        n: getattr(p, accessor_mapping[type(p)]).data_ptr()
                        if type(p) in accessor_mapping
                        else p.data_ptr()
                        for n, p in obj.named_parameters()
                    }
                )
        return named_parameters

    def _get_devices(self, *args):
        model_device = None
        optimizer_device = None
        for obj in args:
            # Loop through model parameters and stop at the first once we have its device.
            if isinstance(obj, torch.nn.Module):
                for param in obj.parameters():
                    model_device = param.device
                    break
            # Loop through optimizer parameters groups and stop at the first once we have its device.
            if isinstance(obj, torch.optim.Optimizer):
                for param_group in obj.param_groups:
                    if len(param_group["params"]) > 0:
                        optimizer_device = param_group["params"][0].device
                        break
        return (model_device, optimizer_device)

    def get_state_dict(self, model, unwrap=True):
        """
        Returns the state dictionary of a model sent through [`Accelerator.prepare`] potentially without full
        precision.

        Args:
            model (`torch.nn.Module`):
                A PyTorch model sent through [`Accelerator.prepare`]
            unwrap (`bool`, *optional*, defaults to `True`):
                Whether to return the original underlying state_dict of `model` or to return the wrapped state_dict.
                Only consulted outside of the DeepSpeed and FSDP code paths.

        Returns:
            `dict`: The state dictionary of the model potentially without full precision.

        Raises:
            `ValueError`: With DeepSpeed ZeRO-3 or tensor-parallel sharding, when
                `model.zero_gather_16bit_weights_on_model_save()` is false.
            `ImportError`: With DeepSpeed tensor-parallel sharding, when the installed DeepSpeed is older than the
                minimum required version.

        Example:

        ```python
        >>> import torch
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> net = torch.nn.Linear(2, 2)
        >>> net = accelerator.prepare(net)
        >>> state_dict = accelerator.get_state_dict(net)
        ```
        """
        if self.distributed_type == DistributedType.DEEPSPEED:
            ds_config = self.deepspeed_config
            uses_zero3 = ds_config["zero_optimization"]["stage"] == 3
            uses_tp = ds_config.get("tensor_parallel", {}).get("autotp_size", 0) > 1

            if not (uses_zero3 or uses_tp):
                # Weights are not sharded: clone the tensors of the unwrapped model.
                from deepspeed.checkpoint.utils import clone_tensors_for_torch_save

                return clone_tensors_for_torch_save(self.unwrap_model(model).state_dict())

            if not model.zero_gather_16bit_weights_on_model_save():
                raise ValueError(
                    "Cannot get 16bit model weights because `stage3_gather_16bit_weights_on_model_save` in DeepSpeed config is False. "
                    "To save the model weights in 16bit, set `stage3_gather_16bit_weights_on_model_save` to True in DeepSpeed config file or "
                    "set `zero3_save_16bit_model` to True when using `accelerate config`. "
                    "To save the full checkpoint, run `model.save_checkpoint(save_dir)` and use `zero_to_fp32.py` to recover weights."
                )

            ver_min_required = "0.16.4"
            if uses_tp and not compare_versions("deepspeed", ">=", ver_min_required):
                raise ImportError(
                    f"Deepspeed TP requires deepspeed>={ver_min_required}. Please update DeepSpeed via `pip install deepspeed -U`."
                )
            # Tensor parallelism takes precedence over ZeRO-3 when both are enabled.
            if uses_tp:
                return model._consolidated_16bit_state_dict()
            return model._zero3_consolidated_16bit_state_dict()

        if self.is_fsdp2:
            from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict

            options = StateDictOptions(full_state_dict=True, broadcast_from_rank0=True, cpu_offload=True)
            return get_model_state_dict(model, options=options)

        if self.distributed_type == DistributedType.FSDP:
            from torch.distributed.fsdp import FullStateDictConfig, StateDictType
            from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

            full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
            with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, full_state_dict_config):
                return model.state_dict()

        source_model = self.unwrap_model(model) if unwrap else model
        return source_model.state_dict()

    def register_for_checkpointing(self, *objects):
        """
        Makes note of `objects` and will save or load them in during `save_state` or `load_state`.

        These should be utilized when the state is being loaded or saved in the same script. It is not designed to be
        used in different scripts.

        <Tip>

        Every `object` must have a `load_state_dict` and `state_dict` function to be stored.

        </Tip>

        Args:
            *objects:
                The objects to register. They are appended, in order, to the list of custom objects.

        Raises:
            `ValueError`: If any object lacks `state_dict` or `load_state_dict`. Nothing is registered in that case, and
                the message lists each offending object with its position among the passed arguments.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> # Assume `CustomObject` has a `state_dict` and `load_state_dict` function.
        >>> obj = CustomObject()
        >>> accelerator.register_for_checkpointing(obj)
        >>> accelerator.save_state("checkpoint.pt")
        ```
        """
        # Keep the position of each invalid object in `objects` so the error points at the right argument
        # (enumerating the filtered list would renumber them from 0).
        invalid_objects = [
            (position, obj)
            for position, obj in enumerate(objects)
            if not (hasattr(obj, "state_dict") and hasattr(obj, "load_state_dict"))
        ]
        if invalid_objects:
            err = "All `objects` must include a `state_dict` and `load_state_dict` function to be stored. The following inputs are invalid:"
            for position, obj in invalid_objects:
                err += f"\n\t- Item at index {position}, `{get_pretty_name(obj)}`"
            raise ValueError(err)
        self._custom_objects.extend(objects)

    @contextmanager
    def maybe_context_parallel(
        self,
        buffers: list[torch.Tensor] | None = None,
        buffer_seq_dims: list[int] | None = None,
        no_restore_buffers: set[torch.Tensor] | None = None,
    ):
        """
        A context manager that enables context parallel training.

        Args:
            buffers (`list[torch.Tensor]`, `optional`):
                Buffers, which are going to be sharded along the sequence dimension. Common examples are inputs, labels
                or positional embedding buffers. This context manager will modify these buffers in-place, and after
                exiting the context, the buffers will be restored to their original state. To avoid unnecessary
                restores, you can use `no_restore_buffers` to specify which buffers don't need to be restored.
            buffer_seq_dims (`list[int]`, `optional`):
                Sequence dimensions of `buffers`.
            no_restore_buffers (`set[torch.Tensor]`, `optional`):
                This set must be a subset of `buffers`. Specifies which buffers from `buffers` argument won't be
                restored after the context exits. These buffers will be then kept in sharded state.

        <Tip warning={true}>

        `context_parallel` is currently supported with FSDP2 and requires `parallelism_config.cp_size` >
        1. If either of these conditions are not met, this context manager will have no effect, though to enable fewer
        code changes it will not raise an Exception.

        </Tip>

        <Tip warning={true}>

        This context manager has to be recreated with each training step, as shown in the example below.

        </Tip>

        Example:

        ```python
        >>> for batch in dataloader:
        ...     with accelerator.maybe_context_parallel(
        ...         buffers=[batch["input_ids"], batch["attention_mask"]],
        ...         buffer_seq_dims=[1, 1],
        ...         no_restore_buffers={batch["input_ids"]},
        ...     ):
        ...         outputs = model(batch)
        ...         ...
        ```
        """
        # We don't need to check FSDP2 as parallelism_config does that for us
        # Invariant: in this branch self._cp_context is set, as it was set by `self._prepare_cp`
        if (
            self.parallelism_config
            and self.parallelism_config.cp_backend == "torch"
            and self.parallelism_config.cp_enabled
        ):
            with self._cp_context(
                buffers=buffers, buffer_seq_dims=buffer_seq_dims, no_restore_buffers=no_restore_buffers
            ):
                yield
        else:
            logger.warning_once(
                "Context parallel training is not enabled. This context manager will have no effect. "
                "To enable it, set `parallelism_config.cp_size` > 1 in the `Accelerator` constructor."
            )
            yield

    @contextmanager
    def autocast(self, autocast_handler: AutocastKwargs = None):
        """
        Runs the enclosed block under the mixed-precision context manager built by
        `get_mixed_precision_context_manager` from `self.native_amp` and the autocast handler. If mixed precision is
        not enabled, nothing different will happen.

        A different `autocast_handler` can be passed in to override the one set in the `Accelerator` object. This is
        useful in blocks under `autocast` where you want to revert to fp32.

        Args:
            autocast_handler (`AutocastKwargs`, *optional*):
                The handler to use for this block. Falls back to `self.autocast_handler` when `None`.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator(mixed_precision="fp16")
        >>> with accelerator.autocast():
        ...     train()
        ```
        """
        handler = self.autocast_handler if autocast_handler is None else autocast_handler
        with get_mixed_precision_context_manager(self.native_amp, handler):
            yield

    @contextmanager
    def profile(self, profile_handler: ProfileKwargs | None = None):
        """
        Will profile the code inside the context manager. The profile will be saved to a Chrome Trace file if
        `profile_handler.output_trace_dir` is set.

        A different `profile_handler` can be passed in to override the one set in the `Accelerator` object.

        Args:
            profile_handler (`ProfileKwargs`, *optional*):
                The profile handler to use for this context manager. If not passed, will use the one set in the
                `Accelerator` object, and a default `ProfileKwargs()` if there is none.

        Yields:
            The profiler object produced by `profile_handler.build()`.

        Example:

        ```python
        # Profile with default settings
        from accelerate import Accelerator
        from accelerate.utils import ProfileKwargs

        accelerator = Accelerator()
        with accelerator.profile() as prof:
            train()
        accelerator.print(prof.key_averages().table())


        # Profile with the custom handler
        def custom_handler(prof):
            print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))


        kwargs = ProfileKwargs(schedule_option=dict(wait=1, warmup=1, active=1), on_trace_ready=custom_handler)
        accelerator = Accelerator(kwargs_handlers=[kwargs])
        with accelerator.profile() as prof:
            for _ in range(10):
                train_iteration()
                prof.step()


        # Profile and export to Chrome Trace
        kwargs = ProfileKwargs(output_trace_dir="output_trace")
        accelerator = Accelerator(kwargs_handlers=[kwargs])
        with accelerator.profile():
            train()
        ```
        """
        active_handler = profile_handler or self.profile_handler or ProfileKwargs()

        with active_handler.build() as profiler:
            yield profiler

        # The export below only runs when the profiled block finished without raising.
        trace_dir = active_handler.output_trace_dir
        if trace_dir is not None:
            os.makedirs(trace_dir, exist_ok=True)
            # One trace file per process, named after the process index.
            trace_name = PROFILE_PATTERN_NAME.format(suffix=self.process_index)
            profiler.export_chrome_trace(os.path.join(trace_dir, trace_name))
            self.wait_for_everyone()

    @property
    def optimizer_step_was_skipped(self):
        """
        Whether or not the optimizer update was skipped (because of gradient overflow in mixed precision), in which
        case the learning rate should not be changed.

        `True` as soon as one of the registered optimizers reports `step_was_skipped`, `False` otherwise (including
        when no optimizer is registered).
        """
        return any(optimizer.step_was_skipped for optimizer in self._optimizers)

    def skip_first_batches(self, dataloader, num_batches: int = 0):
        """
        Creates a new `torch.utils.data.DataLoader` that will efficiently skip the first `num_batches`. This method
        simply forwards its arguments to the module-level `skip_first_batches` helper.

        Args:
            dataloader (`torch.utils.data.DataLoader`): The data loader in which to skip batches.
            num_batches (`int`, *optional*, defaults to 0): The number of batches to skip

        Returns:
            The object returned by the `skip_first_batches` helper.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> dataloader, model, optimizer, scheduler = accelerator.prepare(dataloader, model, optimizer, scheduler)
        >>> skipped_dataloader = accelerator.skip_first_batches(dataloader, num_batches=2)
        >>> # for the first epoch only
        >>> for input, target in skipped_dataloader:
        ...     optimizer.zero_grad()
        ...     output = model(input)
        ...     loss = loss_func(output, target)
        ...     accelerator.backward(loss)
        ...     optimizer.step()

        >>> # subsequent epochs
        >>> for input, target in dataloader:
        ...     optimizer.zero_grad()
        ...     ...
        ```
        """
        # Inside the method body the bare name resolves to the module-level function, not to this method.
        shortened_dataloader = skip_first_batches(dataloader, num_batches=num_batches)
        return shortened_dataloader

    def __deepcopy__(self, memo):
        """
        Deep-copy hook: logs an informational message and returns this very instance instead of a copy. `memo` is
        not used.
        """
        same_instance = self
        logger.info("Deep copying the `Accelerator` object, note that this will point to the same original object.")
        return same_instance

    def verify_device_map(self, model: torch.nn.Module) -> bool:
        """
        Verifies that `model` has not been prepared with big model inference with a device-map resembling `auto`.

        Returns:
            `bool`: `True` if `model` or any of its submodules has an `hf_device_map` attribute with more than one
            entry, `False` otherwise.
        """
        return any(
            hasattr(submodule, "hf_device_map") and len(submodule.hf_device_map) > 1
            for submodule in model.modules()
        )

    def lomo_backward(self, loss: torch.Tensor, learning_rate: float) -> None:
        """
        Runs backward pass on LOMO optimizers.

        Every registered optimizer whose wrapped `optimizer` attribute is a `Lomo` or `AdaLomo` instance gets its
        `fused_backward(loss, learning_rate)` called.

        Args:
            loss (`torch.Tensor`): The loss to run the fused backward pass on.
            learning_rate (`float`): The learning rate forwarded to `fused_backward`. Must not be `None`.

        Raises:
            `ValueError`: If `learning_rate` is `None`, or if no registered optimizer is a LOMO optimizer (which is
                always the case when `lomo_optim` is not installed).
        """
        # With `lomo_optim` missing, keep an empty tuple so the `isinstance` check below is simply `False` and the
        # explicit `ValueError` at the end is raised instead of an `UnboundLocalError` on `Lomo`/`AdaLomo`.
        lomo_optimizer_types = ()
        if is_lomo_available():
            # We need to import locally to avoid circular imports since lomo imports stuff from
            # transformers & accelerate
            from lomo_optim import AdaLomo, Lomo

            lomo_optimizer_types = (Lomo, AdaLomo)

        if learning_rate is None:
            raise ValueError("A learning rate must be passed in order to call backward pass with LOMO optimizers.")

        _backward_called = False

        for optimizer in self._optimizers:
            if isinstance(optimizer.optimizer, lomo_optimizer_types):
                optimizer.optimizer.fused_backward(loss, learning_rate)
                _backward_called = True

        if not _backward_called:
            raise ValueError(
                "Backward pass not properly called on LOMO optimizers. Are you sure you passed a LOMO optimizer in accelerator.prepare()?"
            )

    @property
    def fp8_backend(self) -> FP8BackendType:
        """
        Returns the configured backend for training in FP8.

        Resolution order: when an FP8 handler is present, the first non-`None` recipe handler decides (generic FP8
        recipe, then AO, TE, MSAMP). Without an FP8 handler, a DeepSpeed plugin with `enable_msamp` selects MSAMP. In
        every other case the `ACCELERATE_FP8_BACKEND` environment variable is parsed, defaulting to `"NO"`.
        """
        if self.has_fp8_handler:
            if self.fp8_recipe_handler is not None:
                return FP8BackendType(self.fp8_recipe_handler.backend)
            if self.ao_recipe_handler is not None:
                return FP8BackendType.AO
            if self.te_recipe_handler is not None:
                return FP8BackendType.TE
            if self.msamp_recipe_handler is not None:
                return FP8BackendType.MSAMP
        else:
            # The DeepSpeed plugin is only consulted when no FP8 handler was given.
            ds_plugin = self.state.deepspeed_plugin
            if ds_plugin is not None and ds_plugin.enable_msamp:
                return FP8BackendType.MSAMP

        env_choice = parse_choice_from_env("ACCELERATE_FP8_BACKEND", "NO")
        return FP8BackendType(env_choice)


================================================
FILE: src/accelerate/big_modeling.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import re
from contextlib import contextmanager
from functools import wraps
from typing import Optional, Union

import torch
import torch.nn as nn

from .hooks import (
    AlignDevicesHook,
    CpuOffload,
    LayerwiseCastingHook,
    UserCpuOffloadHook,
    add_hook_to_module,
    attach_align_device_hook,
    attach_align_device_hook_on_blocks,
)
from .utils import (
    OffloadedWeightsLoader,
    check_cuda_p2p_ib_support,
    check_device_map,
    extract_submodules_state_dict,
    find_tied_parameters,
    get_balanced_memory,
    infer_auto_device_map,
    is_bnb_available,
    is_mlu_available,
    is_musa_available,
    is_neuron_available,
    is_npu_available,
    is_sdaa_available,
    is_xpu_available,
    load_checkpoint_in_model,
    offload_state_dict,
    parse_flag_from_env,
    retie_parameters,
)
from .utils.constants import SUPPORTED_PYTORCH_LAYERS_FOR_UPCASTING
from .utils.other import recursive_getattr


logger = logging.getLogger(__name__)


@contextmanager
def init_empty_weights(include_buffers: Optional[bool] = None):
    """
    A context manager under which models are initialized with all parameters on the meta device, therefore creating an
    empty model. Useful when just initializing the model would blow the available RAM.

    This is [`init_on_device`] called with `torch.device("meta")`.

    Args:
        include_buffers (`bool`, *optional*):
            Whether or not to also put all buffers on the meta device while initializing. When left to `None`, the
            value is read from the `ACCELERATE_INIT_INCLUDE_BUFFERS` environment flag (default `False`).

    Example:

    ```python
    import torch.nn as nn
    from accelerate import init_empty_weights

    # Initialize a model with 100 billions parameters in no time and without using any RAM.
    with init_empty_weights():
        tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
    ```

    <Tip warning={true}>

    Any model created under this context manager has no weights. As such you can't do something like
    `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`].
    Make sure to overwrite the default device_map param for [`load_checkpoint_and_dispatch`], otherwise dispatch is not
    called.

    </Tip>
    """
    if include_buffers is None:
        include_buffers = parse_flag_from_env("ACCELERATE_INIT_INCLUDE_BUFFERS", False)

    meta_device = torch.device("meta")
    with init_on_device(meta_device, include_buffers=include_buffers) as handle:
        yield handle


@contextmanager
def init_on_device(device: torch.device, include_buffers: Optional[bool] = None):
    """
    A context manager under which models are initialized with all parameters on the specified device.

    Args:
        device (`torch.device`):
            Device to initialize all parameters on.
        include_buffers (`bool`, *optional*):
            Whether or not to also put all buffers on the specified device while initializing. When left to `None`,
            the value is read from the `ACCELERATE_INIT_INCLUDE_BUFFERS` environment flag (default `False`).

    Example:

    ```python
    import torch.nn as nn
    from accelerate import init_on_device

    # init model on specified device(e.g., "cuda", "xpu" and so on)
    with init_on_device(device=torch.device("cuda")):
        tst = nn.Linear(100, 100)  # on specified device
    ```
    """
    if include_buffers is None:
        include_buffers = parse_flag_from_env("ACCELERATE_INIT_INCLUDE_BUFFERS", False)

    if include_buffers:
        # Parameters and buffers both go to `device`: entering the device as a context manager is enough, no
        # patching of `nn.Module` is needed.
        with device:
            yield
        return

    # From here on `include_buffers` is falsy, so only parameter registration has to be intercepted.
    old_register_parameter = nn.Module.register_parameter

    def register_empty_parameter(module, name, param):
        # Register normally first, then swap the stored parameter for a copy living on `device`.
        old_register_parameter(module, name, param)
        if param is not None:
            param_cls = type(module._parameters[name])
            kwargs = module._parameters[name].__dict__
            kwargs["requires_grad"] = param.requires_grad
            # Pop non-constructor attributes before creating the parameter, then restore them after
            _is_hf_initialized = kwargs.pop("_is_hf_initialized", None)
            module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
            if _is_hf_initialized is not None:
                module._parameters[name]._is_hf_initialized = _is_hf_initialized

    try:
        nn.Module.register_parameter = register_empty_parameter
        yield
    finally:
        # Always undo the class-level patch, even if the body raised.
        nn.Module.register_parameter = old_register_parameter


def cpu_offload(
    model: nn.Module,
    execution_device: Optional[torch.device] = None,
    offload_buffers: bool = False,
    state_dict: Optional[dict[str, torch.Tensor]] = None,
    preload_module_classes: Optional[list[str]] = None,
):
    """
    Activates full CPU offload for a model. As a result, all parameters of the model will be offloaded and only one
    copy of the state dict of the model will be kept. During the forward pass, parameters will be extracted from that
    state dict and put on the execution device passed as they are needed, then offloaded again.

    Args:
        model (`torch.nn.Module`):
            The model to offload.
        execution_device (`torch.device`, *optional*):
            The device on which the forward pass of the model will be executed (should be a GPU). Will default to the
            model first parameter device.
        offload_buffers (`bool`, *optional*, defaults to `False`):
            Whether or not to offload the buffers with the model parameters.
        state_dict (`Dict[str, torch.Tensor]`, *optional*):
            The state dict of the model that will be kept on CPU. When not passed, it is built by moving every entry
            of `model.state_dict()` to the CPU.
        preload_module_classes (`List[str]`, *optional*):
            A list of classes whose instances should load all their weights (even in the submodules) at the beginning
            of the forward. This should only be used for classes that have submodules which are registered but not
            called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
            `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.

    Returns:
        `torch.nn.Module`: The same `model`, with the offloading hooks attached.
    """
    if execution_device is None:
        first_param = next(iter(model.parameters()))
        execution_device = first_param.device
    if state_dict is None:
        state_dict = {}
        for weight_name, weight in model.state_dict().items():
            state_dict[weight_name] = weight.to("cpu")

    io_hook = AlignDevicesHook(io_same_device=True)
    add_hook_to_module(model, io_hook, append=True)
    attach_align_device_hook(
        model,
        execution_device=execution_device,
        offload=True,
        offload_buffers=offload_buffers,
        weights_map=state_dict,
        preload_module_classes=preload_module_classes,
    )

    return model


def cpu_offload_with_hook(
    model: torch.nn.Module,
    execution_device: Optional[Union[int, str, torch.device]] = None,
    prev_module_hook: Optional[UserCpuOffloadHook] = None,
):
    """
    Offloads a model on the CPU and puts it back to an execution device when executed. The difference with
    [`cpu_offload`] is that the model stays on the execution device after the forward and is only offloaded again when
    the `offload` method of the returned `hook` is called. Useful for pipelines running a model in a loop.

    Args:
        model (`torch.nn.Module`):
            The model to offload.
        execution_device(`str`, `int` or `torch.device`, *optional*):
            The device on which the model should be executed. Will default to the MPS device if it's available, then
            device 0 if there is an accelerator device, and finally to the CPU.
        prev_module_hook (`UserCpuOffloadHook`, *optional*):
            The hook sent back by this function for a previous model in the pipeline you are running. If passed, its
            offload method will be called just before the forward of the model to which this hook is attached.

    Returns:
        `tuple`: `(model, hook)` where `hook` is the `UserCpuOffloadHook` wrapping the hook attached to `model`.

    Example:

    ```py
    model_1, hook_1 = cpu_offload_with_hook(model_1, device)
    model_2, hook_2 = cpu_offload_with_hook(model_2, device, prev_module_hook=hook_1)
    model_3, hook_3 = cpu_offload_with_hook(model_3, device, prev_module_hook=hook_2)

    hid_1 = model_1(input)
    for i in range(50):
        # model1 is offloaded on the CPU at the first iteration, model 2 stays on the GPU for this whole loop.
        hid_2 = model_2(hid_1)
    # model2 is offloaded to the CPU just before this forward.
    hid_3 = model_3(hid_2)

    # For model3, you need to manually call the hook offload method.
    hook_3.offload()
    ```
    """
    offload_hook = CpuOffload(execution_device=execution_device, prev_module_hook=prev_module_hook)
    add_hook_to_module(model, offload_hook, append=True)
    return model, UserCpuOffloadHook(model, offload_hook)


def disk_offload(
    model: nn.Module,
    offload_dir: Union[str, os.PathLike],
    execution_device: Optional[torch.device] = None,
    offload_buffers: bool = False,
    preload_module_classes: Optional[list[str]] = None,
):
    """
    Activates full disk offload for a model. As a result, all parameters of the model will be offloaded as
    memory-mapped array in a given folder. During the forward pass, parameters will be accessed from that folder and
    put on the execution device passed as they are needed, then offloaded again.

    Args:
        model (`torch.nn.Module`): The model to offload.
        offload_dir (`str` or `os.PathLike`):
            The folder in which to offload the model weights (or where the model weights are already offloaded). The
            weights are only written when the folder or its `index.json` file does not exist yet.
        execution_device (`torch.device`, *optional*):
            The device on which the forward pass of the model will be executed (should be a GPU). Will default to the
            model's first parameter device.
        offload_buffers (`bool`, *optional*, defaults to `False`):
            Whether or not to offload the buffers with the model parameters.
        preload_module_classes (`List[str]`, *optional*):
            A list of classes whose instances should load all their weights (even in the submodules) at the beginning
            of the forward. This should only be used for classes that have submodules which are registered but not
            called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
            `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.

    Returns:
        `torch.nn.Module`: The same `model`, with the offloading hooks attached.
    """
    # An existing folder that holds an `index.json` is treated as already offloaded and is reused as is.
    already_offloaded = os.path.isdir(offload_dir) and os.path.isfile(os.path.join(offload_dir, "index.json"))
    if not already_offloaded:
        offload_state_dict(offload_dir, model.state_dict())
    if execution_device is None:
        first_param = next(iter(model.parameters()))
        execution_device = first_param.device
    weights_map = OffloadedWeightsLoader(save_folder=offload_dir)

    io_hook = AlignDevicesHook(io_same_device=True)
    add_hook_to_module(model, io_hook, append=True)
    attach_align_device_hook(
        model,
        execution_device=execution_device,
        offload=True,
        offload_buffers=offload_buffers,
        weights_map=weights_map,
        preload_module_classes=preload_module_classes,
    )

    return model


def dispatch_model(
    model: nn.Module,
    device_map: dict[str, Union[str, int, torch.device]],
    main_device: Optional[torch.device] = None,
    state_dict: Optional[dict[str, torch.Tensor]] = None,
    offload_dir: Optional[Union[str, os.PathLike]] = None,
    offload_index: Optional[dict[str, str]] = None,
    offload_buffers: bool = False,
    skip_keys: Optional[Union[str, list[str]]] = None,
    preload_module_classes: Optional[list[str]] = None,
    force_hooks: bool = False,
):
    """
    Dispatches a model according to a given device map. Layers of the model might be spread across GPUs, offloaded on
    the CPU or even the disk.

    Args:
        model (`torch.nn.Module`):
            The model to dispatch.
        device_map (`Dict[str, Union[str, int, torch.device]]`):
            A dictionary mapping module names in the models `state_dict` to the device they should go to. Note that
            `"disk"` is accepted even if it's not a proper value for `torch.device`.
        main_device (`str`, `int` or `torch.device`, *optional*):
            The main execution device. Will default to the first device in the `device_map` different from `"cpu"` or
            `"disk"`.
        state_dict (`Dict[str, torch.Tensor]`, *optional*):
            The state dict of the part of the model that will be kept on CPU.
        offload_dir (`str` or `os.PathLike`, *optional*):
            The folder in which to offload the model weights (or where the model weights are already offloaded).
            Required when the `device_map` contains `"disk"` values and no `offload_index` is passed.
        offload_index (`Dict`, *optional*):
            A dictionary from weight name to their information (`dtype`/ `shape` or safetensors filename). Will default
            to the index saved in `offload_dir`.
        offload_buffers (`bool`, *optional*, defaults to `False`):
            Whether or not to offload the buffers with the model parameters.
        skip_keys (`str` or `List[str]`, *optional*):
            A list of keys to ignore when moving inputs or outputs between devices.
        preload_module_classes (`List[str]`, *optional*):
            A list of classes whose instances should load all their weights (even in the submodules) at the beginning
            of the forward. This should only be used for classes that have submodules which are registered but not
            called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
            `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
        force_hooks (`bool`, *optional*, defaults to `False`):
            Whether or not to force device hooks to be attached to the model even if all layers are dispatched to a
            single device.

    Returns:
        `torch.nn.Module`: The same `model` object, with its `hf_device_map` attribute set to a plain `dict` copy of
        `device_map`.

    Raises:
        `ValueError`: If hooks are attached, some modules are mapped to `"disk"` and neither `offload_dir` nor
        `offload_index` is given, or if no hooks are attached and the single target device is `"disk"`.
    """
    # Error early if the device map is incomplete.
    check_device_map(model, device_map)

    # We need to force hook for quantized model that can't be moved with to()
    if getattr(model, "quantization_method", "bitsandbytes") == "bitsandbytes":
        # Hooks are forced when the installed bitsandbytes is older than 0.48.0 (8-bit models) or older than
        # 0.43.2 (4-bit models); since bnb 0.43.2, we can move 4-bit model
        if (getattr(model, "is_loaded_in_8bit", False) and not is_bnb_available(min_version="0.48.0")) or (
            getattr(model, "is_loaded_in_4bit", False) and not is_bnb_available(min_version="0.43.2")
        ):
            force_hooks = True

    # The device map is not modified below, so the set of distinct targets is computed only once.
    unique_devices = set(device_map.values())

    # We attach hooks if the device_map has at least 2 different devices or if
    # force_hooks is set to `True`. Otherwise, the model is already loaded
    # in the unique device and the user can decide where to dispatch the model.
    # If the model is quantized, we always force-dispatch the model
    if len(unique_devices) > 1 or force_hooks:
        if main_device is None:
            if unique_devices == {"cpu"} or unique_devices == {"cpu", "disk"}:
                main_device = "cpu"
            else:
                main_device = [d for d in device_map.values() if d not in ["cpu", "disk"]][0]

        if main_device != "cpu":
            cpu_modules = [name for name, device in device_map.items() if device == "cpu"]
            if state_dict is None and len(cpu_modules) > 0:
                state_dict = extract_submodules_state_dict(model.state_dict(), cpu_modules)

        disk_modules = [name for name, device in device_map.items() if device == "disk"]
        if offload_dir is None and offload_index is None and len(disk_modules) > 0:
            raise ValueError(
                "We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules "
                f"need to be offloaded: {', '.join(disk_modules)}."
            )
        # Write the disk-mapped weights to `offload_dir` unless an index was passed or the folder already has one.
        if (
            len(disk_modules) > 0
            and offload_index is None
            and (not os.path.isdir(offload_dir) or not os.path.isfile(os.path.join(offload_dir, "index.json")))
        ):
            disk_state_dict = extract_submodules_state_dict(model.state_dict(), disk_modules)
            offload_state_dict(offload_dir, disk_state_dict)

        # Modules mapped to "cpu"/"disk" are executed on the main device.
        execution_device = {
            name: main_device if device in ["cpu", "disk"] else device for name, device in device_map.items()
        }
        execution_device[""] = main_device
        # When the main device is "cpu" or "mps", only "disk" counts as an offload target.
        offloaded_devices = ["disk"] if main_device == "cpu" or main_device == "mps" else ["cpu", "disk"]
        offload = {name: device in offloaded_devices for name, device in device_map.items()}
        save_folder = offload_dir if len(disk_modules) > 0 else None
        if state_dict is not None or save_folder is not None or offload_index is not None:
            device = main_device if offload_index is not None else None
            weights_map = OffloadedWeightsLoader(
                state_dict=state_dict, save_folder=save_folder, index=offload_index, device=device
            )
        else:
            weights_map = None

        # When dispatching the model's parameters to the devices specified in device_map, we want to avoid allocating memory several times for the
        # tied parameters. The dictionary tied_params_map keeps track of the already allocated data for a given tied parameter (represented by its
        # original pointer) on each devices.
        tied_params = find_tied_parameters(model)

        tied_params_map = {}
        for group in tied_params:
            for param_name in group:
                # data_ptr() is enough here, as `find_tied_parameters` finds tied params simply by comparing `param1 is param2`, so we don't need
                # to care about views of tensors through storage_offset.
                data_ptr = recursive_getattr(model, param_name).data_ptr()
                tied_params_map[data_ptr] = {}

                # Note: To handle the disk offloading case, we can not simply use weights_map[param_name].data_ptr() as the reference pointer,
                # as we have no guarantee that safetensors' `file.get_tensor()` will always give the same pointer.

        attach_align_device_hook_on_blocks(
            model,
            execution_device=execution_device,
            offload=offload,
            offload_buffers=offload_buffers,
            weights_map=weights_map,
            skip_keys=skip_keys,
            preload_module_classes=preload_module_classes,
            tied_params_map=tied_params_map,
        )

        # warn if there is any params on the meta device. Only the devices that are actually treated as offload
        # targets (the same list that drives `offload` above) are reported, in a fixed order.
        offloaded_devices_str = " and ".join(device for device in offloaded_devices if device in unique_devices)
        if offloaded_devices_str:
            logger.warning(
                f"Some parameters are on the meta device because they were offloaded to the {offloaded_devices_str}."
            )

        # Attaching the hook may break tied weights, so we retie them
        retie_parameters(model, tied_params)

        # add warning on `to` method
        def add_warning(fn, model):
            @wraps(fn)
            def wrapper(*args, **kwargs):
                warning_msg = "You shouldn't move a model that is dispatched using accelerate hooks."
                if str(fn.__name__) == "to":
                    # `to` can also be called with only a dtype: warn only when a device is actually requested
                    to_device = torch._C._nn._parse_to(*args, **kwargs)[0]
                    if to_device is not None:
                        logger.warning(warning_msg)
                else:
                    logger.warning(warning_msg)
                for param in model.parameters():
                    if param.device == torch.device("meta"):
                        raise RuntimeError("You can't move a model that has some modules offloaded to cpu or disk.")
                return fn(*args, **kwargs)

            return wrapper

        # Make sure to update _accelerate_added_attributes in hooks.py if you add any hook
        model.to = add_warning(model.to, model)
        if is_npu_available():
            model.npu = add_warning(model.npu, model)
        elif is_mlu_available():
            model.mlu = add_warning(model.mlu, model)
        elif is_sdaa_available():
            model.sdaa = add_warning(model.sdaa, model)
        elif is_musa_available():
            model.musa = add_warning(model.musa, model)
        elif is_xpu_available():
            model.xpu = add_warning(model.xpu, model)
        elif is_neuron_available():
            model.neuron = add_warning(model.neuron, model)
        else:
            model.cuda = add_warning(model.cuda, model)

        # Check if we are using multi-gpus with RTX 4000 series
        use_multi_gpu = len([device for device in unique_devices if device not in ("cpu", "disk")]) > 1
        if use_multi_gpu and not check_cuda_p2p_ib_support():
            logger.warning(
                "We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. "
                "This can affect the multi-gpu inference when using accelerate device_map. "
                "Please make sure to update your driver to the latest version which resolves this."
            )
    else:
        device = list(device_map.values())[0]
        # `torch.Tensor.to(<int num>)` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
        if is_npu_available() and isinstance(device, int):
            device = f"npu:{device}"
        elif is_mlu_available() and isinstance(device, int):
            device = f"mlu:{device}"
        elif is_sdaa_available() and isinstance(device, int):
            device = f"sdaa:{device}"
        elif is_musa_available() and isinstance(device, int):
            device = f"musa:{device}"
        elif is_neuron_available() and isinstance(device, int):
            device = f"neuron:{device}"
        if device != "disk":
            model.to(device)
        else:
            raise ValueError(
                "You are trying to offload the whole model to the disk. Please use the `disk_offload` function instead."
            )
    # Convert OrderedDict back to dict for easier usage
    model.hf_device_map = dict(device_map)
    return model


def load_checkpoint_and_dispatch(
    model: nn.Module,
    checkpoint: Union[str, os.PathLike],
    device_map: Optional[Union[str, dict[str, Union[int, str, torch.device]]]] = None,
    max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
    no_split_module_classes: Optional[list[str]] = None,
    offload_folder: Optional[Union[str, os.PathLike]] = None,
    offload_buffers: bool = False,
    dtype: Optional[Union[str, torch.dtype]] = None,
    offload_state_dict: Optional[bool] = None,
    skip_keys: Optional[Union[str, list[str]]] = None,
    preload_module_classes: Optional[list[str]] = None,
    force_hooks: bool = False,
    strict: bool = False,
    full_state_dict: bool = True,
    broadcast_from_rank0: bool = False,
):
    """
    Loads a (potentially sharded) checkpoint inside a model, potentially sending weights to a given device as they are
    loaded, then dispatches the model so that it runs properly even if split across devices.

    The work is done in three steps: a string `device_map` is first resolved into an actual mapping, the checkpoint is
    then loaded with `load_checkpoint_in_model`, and finally (only when a device map is available) the model is handed
    to [`dispatch_model`].

    Args:
        model (`torch.nn.Module`): The model in which we want to load a checkpoint.
        checkpoint (`str` or `os.PathLike`):
            The folder checkpoint to load. It can be:
            - a path to a file containing a whole model state dict
            - a path to a `.json` file containing the index to a sharded checkpoint
            - a path to a folder containing a unique `.index.json` file and the shards of a checkpoint.
        device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
            A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
            name, once a given module name is inside, every submodule of it will be sent to the same device.

            To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. The
            accepted strings are `"auto"`, `"balanced"`, `"balanced_low_0"` and `"sequential"`. For more information
            about each option see [here](../concept_guides/big_model_inference#designing-a-device-map). Defaults to
            None, which means [`dispatch_model`] will not be called.
        max_memory (`Dict`, *optional*):
            A dictionary device identifier to maximum memory. Will default to the maximum memory available for each GPU
            and the available CPU RAM if unset.
        no_split_module_classes (`List[str]`, *optional*):
            A list of layer class names that should never be split across device (for instance any layer that has a
            residual connection).
        offload_folder (`str` or `os.PathLike`, *optional*):
            If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
        offload_buffers (`bool`, *optional*, defaults to `False`):
            In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
            well as the parameters.
        dtype (`str` or `torch.dtype`, *optional*):
            If provided, the weights will be converted to that type when loaded.
        offload_state_dict (`bool`, *optional*):
            If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
            the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
            picked contains `"disk"` values.
        skip_keys (`str` or `List[str]`, *optional*):
            A list of keys to ignore when moving inputs or outputs between devices.
        preload_module_classes (`List[str]`, *optional*):
            A list of classes whose instances should load all their weights (even in the submodules) at the beginning
            of the forward. This should only be used for classes that have submodules which are registered but not
            called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
            `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
        force_hooks (`bool`, *optional*, defaults to `False`):
            Whether or not to force device hooks to be attached to the model even if all layers are dispatched to a
            single device.
        strict (`bool`, *optional*, defaults to `False`):
            Whether to strictly enforce that the keys in the checkpoint state_dict match the keys of the model's
            state_dict.
        full_state_dict (`bool`, *optional*, defaults to `True`): if this is set to `True`, all the tensors in the
            loaded state_dict will be gathered. No ShardedTensor and DTensor will be in the loaded state_dict.
        broadcast_from_rank0 (`bool`, *optional*, defaults to `False`): when the option is `True`, a distributed
            `ProcessGroup` must be initialized. rank0 should receive a full state_dict and will broadcast the tensors
            in the state_dict one by one to other ranks. Other ranks will receive the tensors and shard (if applicable)
            according to the local shards in the model.

    Returns:
        `torch.nn.Module`: `model` itself when `device_map` is `None`, otherwise the result of [`dispatch_model`].

    Raises:
        `ValueError`: If `device_map` is a string other than the four accepted ones.

    Example:

    ```python
    >>> from accelerate import init_empty_weights, load_checkpoint_and_dispatch
    >>> from huggingface_hub import hf_hub_download
    >>> from transformers import AutoConfig, AutoModelForCausalLM

    >>> # Download the Weights
    >>> checkpoint = "EleutherAI/gpt-j-6B"
    >>> weights_location = hf_hub_download(checkpoint, "pytorch_model.bin")

    >>> # Create a model and initialize it with empty weights
    >>> config = AutoConfig.from_pretrained(checkpoint)
    >>> with init_empty_weights():
    ...     model = AutoModelForCausalLM.from_config(config)

    >>> # Load the checkpoint and dispatch it to the right devices
    >>> model = load_checkpoint_and_dispatch(
    ...     model, weights_location, device_map="auto", no_split_module_classes=["GPTJBlock"]
    ... )
    ```
    """
    # Step 1: turn a strategy name into a concrete device map.
    if isinstance(device_map, str):
        strategy = device_map
        if strategy not in ("auto", "balanced", "balanced_low_0", "sequential"):
            raise ValueError(
                "If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or 'sequential'."
            )
        # Every strategy except "sequential" first rebalances the memory budget.
        if strategy != "sequential":
            max_memory = get_balanced_memory(
                model,
                max_memory=max_memory,
                no_split_module_classes=no_split_module_classes,
                dtype=dtype,
                low_zero=(strategy == "balanced_low_0"),
            )
        device_map = infer_auto_device_map(
            model,
            max_memory=max_memory,
            no_split_module_classes=no_split_module_classes,
            dtype=dtype,
            offload_buffers=offload_buffers,
        )

    # An unset `offload_state_dict` is switched on as soon as something is mapped to the disk.
    if offload_state_dict is None and device_map is not None:
        if "disk" in device_map.values():
            offload_state_dict = True

    # Step 2: load the weights.
    load_checkpoint_in_model(
        model,
        checkpoint,
        device_map=device_map,
        offload_folder=offload_folder,
        dtype=dtype,
        offload_state_dict=offload_state_dict,
        offload_buffers=offload_buffers,
        strict=strict,
        full_state_dict=full_state_dict,
        broadcast_from_rank0=broadcast_from_rank0,
    )

    # Step 3: dispatch, but only when there is a device map to dispatch with.
    if device_map is not None:
        return dispatch_model(
            model,
            device_map=device_map,
            offload_dir=offload_folder,
            offload_buffers=offload_buffers,
            skip_keys=skip_keys,
            preload_module_classes=preload_module_classes,
            force_hooks=force_hooks,
        )
    return model


def attach_layerwise_casting_hooks(
    module: torch.nn.Module,
    storage_dtype: torch.dtype,
    compute_dtype: torch.dtype,
    skip_modules_pattern: Optional[Union[str, tuple[str, ...]]] = None,
    skip_modules_classes: Optional[tuple[type[torch.nn.Module], ...]] = None,
    non_blocking: bool = False,
) -> None:
    r"""
    Applies layerwise casting to a given PyTorch `nn.Module`. This is helpful for reducing memory requirements when
    one doesn't want to fully quantize a model: params can be kept in, say, `torch.float8_e4m3fn`, upcasted to a higher
    precision like `torch.bfloat16` during the forward pass and downcasted back to `torch.float8_e4m3fn` afterwards to
    realize memory savings.

    This public entry point only forwards its arguments to `_attach_layerwise_casting_hooks`, which walks the module
    tree starting from an empty name prefix.

    Args:
        module (`torch.nn.Module`):
            The module whose leaf modules will be cast to a high precision dtype for computation, and to a low
            precision dtype for storage.
        storage_dtype (`torch.dtype`):
            The dtype to cast the module to before/after the forward pass for storage.
        compute_dtype (`torch.dtype`):
            The dtype to cast the module to during the forward pass for computation.
        skip_modules_pattern (`tuple[str, ...]`, defaults to `None`):
            Patterns searched (with `re.search`) in the dotted names of the modules; a module whose name matches is
            skipped together with everything below it.
        skip_modules_classes (`tuple[type[torch.nn.Module], ...]`, defaults to `None`):
            Module classes to skip during the layerwise casting process.
        non_blocking (`bool`, defaults to `False`):
            If `True`, the weight casting operations are non-blocking.

    Example:

    ```python
    >>> from accelerate.hooks import attach_layerwise_casting_hooks
    >>> from transformers import AutoModelForCausalLM
    >>> import torch

    >>> # Model
    >>> checkpoint = "EleutherAI/gpt-j-6B"
    >>> model = AutoModelForCausalLM.from_pretrained(checkpoint)

    >>> # Attach hooks and perform inference
    >>> attach_layerwise_casting_hooks(model, storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16)
    >>> with torch.no_grad():
    ...     model(...)
    ```

    Users can also pass modules they want to avoid from getting downcasted.

    ```py
    >>> attach_layerwise_casting_hooks(
    ...     model, storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16, skip_modules_pattern=["norm"]
    ... )
    ```
    """
    # The recursive helper keeps its `_prefix` default ("") for the root module.
    _attach_layerwise_casting_hooks(
        module=module,
        storage_dtype=storage_dtype,
        compute_dtype=compute_dtype,
        skip_modules_pattern=skip_modules_pattern,
        skip_modules_classes=skip_modules_classes,
        non_blocking=non_blocking,
    )


def _attach_layerwise_casting_hooks(
    module: torch.nn.Module,
    storage_dtype: torch.dtype,
    compute_dtype: torch.dtype,
    skip_modules_pattern: Optional[Union[str, tuple[str, ...]]] = None,
    skip_modules_classes: Optional[tuple[type[torch.nn.Module], ...]] = None,
    non_blocking: bool = False,
    _prefix: str = "",
):
    """
    Recursively attaches a `LayerwiseCastingHook` to the supported layers found under `module`.

    For each visited module, in this order:
    - if it is an instance of one of `skip_modules_classes`, or one of the `skip_modules_pattern` is found (with
      `re.search`) in its dotted name `_prefix`, it is left untouched and its children are not visited;
    - if it is an instance of `SUPPORTED_PYTORCH_LAYERS_FOR_UPCASTING`, the hook is appended to it and its children
      are not visited;
    - otherwise the function recurses into its direct children.

    Args:
        module (`torch.nn.Module`):
            The module currently being visited.
        storage_dtype (`torch.dtype`):
            Storage dtype passed on to `LayerwiseCastingHook`.
        compute_dtype (`torch.dtype`):
            Compute dtype passed on to `LayerwiseCastingHook`.
        skip_modules_pattern (`str` or iterable of `str`, *optional*):
            Pattern(s) of module names to skip. A single string is treated as one pattern.
        skip_modules_classes (tuple or list of module classes, *optional*):
            Module classes to skip.
        non_blocking (`bool`, defaults to `False`):
            Passed on to `LayerwiseCastingHook`.
        _prefix (`str`, defaults to `""`):
            Dotted name of `module` relative to the root of the traversal (internal).
    """
    # A bare string is one pattern: iterating over it would turn every character into its own pattern and skip
    # almost every module.
    if isinstance(skip_modules_pattern, str):
        skip_modules_pattern = (skip_modules_pattern,)
    # `isinstance` does not accept a list of classes, so a list is converted to the tuple form it expects.
    if isinstance(skip_modules_classes, list):
        skip_modules_classes = tuple(skip_modules_classes)

    should_skip = (skip_modules_classes is not None and isinstance(module, skip_modules_classes)) or (
        skip_modules_pattern is not None and any(re.search(pattern, _prefix) for pattern in skip_modules_pattern)
    )
    if should_skip:
        logger.debug(f'Skipping layerwise casting for layer "{_prefix}"')
        return

    if isinstance(module, SUPPORTED_PYTORCH_LAYERS_FOR_UPCASTING):
        logger.debug(f'Applying layerwise casting to layer "{_prefix}"')
        add_hook_to_module(
            module,
            LayerwiseCastingHook(storage_dtype=storage_dtype, compute_dtype=compute_dtype, non_blocking=non_blocking),
            append=True,
        )
        return

    for name, submodule in module.named_children():
        # Children of the root get their bare name; deeper ones get a dotted path.
        layer_name = f"{_prefix}.{name}" if _prefix else name
        _attach_layerwise_casting_hooks(
            submodule,
            storage_dtype,
            compute_dtype,
            skip_modules_pattern,
            skip_modules_classes,
            non_blocking,
            _prefix=layer_name,
        )


def _attach_context_parallel_hooks(
    model: nn.Module,
):
    """
    Monkeypatch huggingface's `transformers` model to fix attention mask issues when using context parallelism.

    This function attaches forward_pre_hooks to each self_attn module of the model, where each hook checks the
    args/kwargs, if they contain an attention mask, if it does, it will remove this mask, check if it is a causal mask,
    if yes, will add a kwarg `is_causal=True`, otherwise will raise an error. This is because context parallelism does
    not support attention masks. This function modifies the model in place.

    Args:
        model (`nn.Module`):
            The model to attach the hooks to.

    """

    def _self_attn_pre_forward_hook(_module, module_args, module_kwargs):
        if "attention_mask" in module_kwargs:
            module_kwargs["attention_mask"] = None
            module_kwargs["is_causal"] = True

        return module_args, module_kwargs

    for name, module in model.named_modules():
        # We hope (assume) that if user uses their own model (without this structure which transformers uses), they read the docs saying they can't pass in attention masks
        # Then these cases can happen:
        # 1) some modules end with a `self-attn` module, in which case we attach the hook, but the
        #    there's no attention mask kwarg -> hook is a no-op
        # 2) some modules end with a `self-attn` module, in which case we attach the hook, and the
        #    attention mask kwarg is passed -> hook will remove the attention mask and add
        #    `is_causal=True` kwarg, which either crashes the training or fixes it
        #    (training would crash anyway as attention mask isn't supported)
        # 3) no modules end with a `self-attn` module, in which case we don't attach the hook, this is
        #    a no-op as well
        if name.endswith("self_attn"):
            # we want the hook to be executed first, to avoid any other hooks doing work on the attention mask
            module.register_forward_pre_hook(_self_attn_pre_forward_hook, with_kwargs=True, prepend=True)


================================================
FILE: src/accelerate/checkpointing.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
from pathlib import Path
from typing import Optional

import numpy as np
import torch
from safetensors.torch import load_model

from .utils import (
    MODEL_NAME,
    OPTIMIZER_NAME,
    RNG_STATE_NAME,
    SAFE_MODEL_NAME,
    SAFE_WEIGHTS_NAME,
    SAMPLER_NAME,
    SCALER_NAME,
    SCHEDULER_NAME,
    WEIGHTS_NAME,
    get_pretty_name,
    is_cuda_available,
    is_hpu_available,
    is_mlu_available,
    is_musa_available,
    is_neuron_available,
    is_sdaa_available,
    is_torch_version,
    is_torch_xla_available,
    is_xpu_available,
    load,
    save,
)


# `GradScaler` is taken from `torch.amp` on torch >= 2.4.0 and from `torch.cuda.amp` on older versions.
if is_torch_version(">=", "2.4.0"):
    from torch.amp import GradScaler
else:
    from torch.cuda.amp import GradScaler

# `xm` is only bound when torch_xla is available; code using it must be guarded by the same check.
if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

from .logging import get_logger
from .state import PartialState


# Module-level logger used by the save/load helpers of this file.
logger = get_logger(__name__)


def save_accelerator_state(
    output_dir: str,
    model_states: list[dict],
    optimizers: list,
    schedulers: list,
    dataloaders: list,
    process_index: int,
    step: int,
    scaler: Optional[GradScaler] = None,
    save_on_each_node: bool = False,
    safe_serialization: bool = True,
):
    """
    Saves the current states of the models, optimizers, schedulers, dataloaders, scaler, and RNG generators to a given
    directory.

    <Tip>

    If `safe_serialization` is `True`, models will be saved with `safetensors` while the rest are saved using native
    `pickle`.

    </Tip>

    Args:
        output_dir (`str` or `os.PathLike`):
            The name of the folder to save all relevant weights and states.
        model_states (`List[dict]`):
            A list of model states, one per model. The first one is saved under the base weights name, the following
            ones get an `_{i}` suffix before the extension.
        optimizers (`List[torch.optim.Optimizer]`):
            A list of optimizer instances
        schedulers (`List[torch.optim.lr_scheduler._LRScheduler]`):
            A list of learning rate schedulers
        dataloaders (`List[torch.utils.data.DataLoader]`):
            A list of dataloader instances to save their sampler states
        process_index (`int`):
            The current process index in the Accelerator state. It is part of the RNG state file name.
        step (`int`):
            The current step in the internal step tracker. It is stored alongside the RNG states.
        scaler (`torch.amp.GradScaler`, *optional*):
            An optional gradient scaler instance to save;
        save_on_each_node (`bool`, *optional*):
            Whether to save on every node, or only the main node.
        safe_serialization (`bool`, *optional*, defaults to `True`):
            Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).

    Returns:
        `pathlib.Path`: `output_dir`, converted to a `Path`.
    """
    output_dir = Path(output_dir)
    # Model states
    base_weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
    for i, state in enumerate(model_states):
        weights_name = base_weights_name if i == 0 else base_weights_name.replace(".", f"_{i}.")
        output_model_file = output_dir.joinpath(weights_name)
        save(state, output_model_file, save_on_each_node=save_on_each_node, safe_serialization=safe_serialization)
        logger.info(f"Model weights saved in {output_model_file}")
    # Optimizer states
    for i, opt in enumerate(optimizers):
        state = opt.state_dict()
        optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
        output_optimizer_file = output_dir.joinpath(optimizer_name)
        save(state, output_optimizer_file, save_on_each_node=save_on_each_node, safe_serialization=False)
        logger.info(f"Optimizer state saved in {output_optimizer_file}")
    # Scheduler states
    for i, scheduler in enumerate(schedulers):
        state = scheduler.state_dict()
        scheduler_name = f"{SCHEDULER_NAME}.bin" if i == 0 else f"{SCHEDULER_NAME}_{i}.bin"
        output_scheduler_file = output_dir.joinpath(scheduler_name)
        save(state, output_scheduler_file, save_on_each_node=save_on_each_node, safe_serialization=False)
        logger.info(f"Scheduler state saved in {output_scheduler_file}")
    # DataLoader states
    # Function-level import, resolved once for all dataloaders
    # NOTE(review): presumably kept out of the module top level to avoid a circular import — confirm
    from .data_loader import IterableDatasetShard, SeedableRandomSampler

    for i, dataloader in enumerate(dataloaders):
        sampler_name = f"{SAMPLER_NAME}.bin" if i == 0 else f"{SAMPLER_NAME}_{i}.bin"
        output_sampler_file = output_dir.joinpath(sampler_name)
        # Only save if we have our custom sampler
        if isinstance(dataloader.dataset, IterableDatasetShard):
            sampler = dataloader.get_sampler()
            if isinstance(sampler, SeedableRandomSampler):
                save(sampler, output_sampler_file, save_on_each_node=save_on_each_node, safe_serialization=False)
                # Logged only when the sampler file has actually been written
                logger.info(f"Sampler state for dataloader {i} saved in {output_sampler_file}")
        if getattr(dataloader, "use_stateful_dataloader", False):
            dataloader_state_dict_name = "dl_state_dict.bin" if i == 0 else f"dl_state_dict_{i}.bin"
            output_dataloader_state_dict_file = output_dir.joinpath(dataloader_state_dict_name)
            state_dict = dataloader.state_dict()
            torch.save(state_dict, output_dataloader_state_dict_file)
            logger.info(f"State dict for dataloader {i} saved in {output_dataloader_state_dict_file}")

    # GradScaler state
    if scaler is not None:
        state = scaler.state_dict()
        output_scaler_file = output_dir.joinpath(SCALER_NAME)
        torch.save(state, output_scaler_file)
        logger.info(f"Gradient scaler state saved in {output_scaler_file}")
    # Random number generator states, one file per process
    states = {}
    states_name = f"{RNG_STATE_NAME}_{process_index}.pkl"
    states["step"] = step
    states["random_state"] = random.getstate()
    states["numpy_random_seed"] = np.random.get_state()
    states["torch_manual_seed"] = torch.get_rng_state()
    if is_xpu_available():
        states["torch_xpu_manual_seed"] = torch.xpu.get_rng_state_all()
    # At most one of the mlu / sdaa / musa states is recorded
    if is_mlu_available():
        states["torch_mlu_manual_seed"] = torch.mlu.get_rng_state_all()
    elif is_sdaa_available():
        states["torch_sdaa_manual_seed"] = torch.sdaa.get_rng_state_all()
    elif is_musa_available():
        states["torch_musa_manual_seed"] = torch.musa.get_rng_state_all()
    if is_hpu_available():
        states["torch_hpu_manual_seed"] = torch.hpu.get_rng_state_all()
    if is_neuron_available():
        states["torch_neuron_manual_seed"] = torch.neuron.get_rng_state_all()
    if is_cuda_available():
        states["torch_cuda_manual_seed"] = torch.cuda.get_rng_state_all()
    if is_torch_xla_available():
        states["xm_seed"] = xm.get_rng_state()
    output_states_file = output_dir.joinpath(states_name)
    torch.save(states, output_states_file)
    logger.info(f"Random states saved in {output_states_file}")
    return output_dir


def load_accelerator_state(
    input_dir,
    models,
    optimizers,
    schedulers,
    dataloaders,
    process_index,
    scaler=None,
    map_location=None,
    load_kwargs=None,
    **load_model_func_kwargs,
):
    """
    Loads states of the models, optimizers, schedulers, dataloaders, scaler, and RNG generators from a given
    directory.

    Args:
        input_dir (`str` or `os.PathLike`):
            The name of the folder to load all relevant weights and states.
        models (`List[torch.nn.Module]`):
            A list of model instances
        optimizers (`List[torch.optim.Optimizer]`):
            A list of optimizer instances
        schedulers (`List[torch.optim.lr_scheduler._LRScheduler]`):
            A list of learning rate schedulers
        dataloaders (`List[torch.utils.data.DataLoader]`):
            A list of dataloader instances whose sampler / stateful dataloader state should be restored
        process_index (`int`):
            The current process index in the Accelerator state
        scaler (`torch.amp.GradScaler`, *optional*):
            An optional *GradScaler* instance to load
        map_location (`str`, *optional*):
            What device to load the model and optimizer state onto. Should be one of either "cpu" or "on_device".
            Defaults to "cpu" when `None`.
        load_kwargs (`dict`, *optional*):
            Additional arguments that can be passed to the `load` function.
        load_model_func_kwargs (`dict`, *optional*):
            Additional arguments that can be passed to the model's `load_state_dict` method.

    Returns:
        `dict`: Contains the `Accelerator` attributes to override while loading the state.

    Raises:
        `TypeError`: If `map_location` is not one of `None`, `"cpu"` or `"on_device"`.
    """
    # stores the `Accelerator` attributes to override
    override_attributes = dict()
    if map_location not in [None, "cpu", "on_device"]:
        raise TypeError(
            "Unsupported optimizer map location passed, please choose one of `None`, `'cpu'`, or `'on_device'`"
        )
    if map_location is None:
        map_location = "cpu"
    elif map_location == "on_device":
        map_location = PartialState().device

    if load_kwargs is None:
        load_kwargs = {}

    input_dir = Path(input_dir)
    # Model states
    for i, model in enumerate(models):
        ending = f"_{i}" if i > 0 else ""
        input_model_file = input_dir.joinpath(f"{SAFE_MODEL_NAME}{ending}.safetensors")
        if input_model_file.exists():
            load_model(model, input_model_file, device=str(map_location), **load_model_func_kwargs)
        else:
            # Load with torch
            input_model_file = input_dir.joinpath(f"{MODEL_NAME}{ending}.bin")
            state_dict = load(input_model_file, map_location=map_location)
            model.load_state_dict(state_dict, **load_model_func_kwargs)
    logger.info("All model weights loaded successfully")

    # Optimizer states
    for i, opt in enumerate(optimizers):
        optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
        input_optimizer_file = input_dir.joinpath(optimizer_name)
        optimizer_state = load(input_optimizer_file, map_location=map_location, **load_kwargs)
        opt.load_state_dict(optimizer_state)
    logger.info("All optimizer states loaded successfully")

    # Scheduler states
    for i, scheduler in enumerate(schedulers):
        scheduler_name = f"{SCHEDULER_NAME}.bin" if i == 0 else f"{SCHEDULER_NAME}_{i}.bin"
        input_scheduler_file = input_dir.joinpath(scheduler_name)
        scheduler_state = load(input_scheduler_file, **load_kwargs)
        scheduler.load_state_dict(scheduler_state)
    logger.info("All scheduler states loaded successfully")

    # Dataloader states
    # Imported lazily inside the function, once for all dataloaders
    from .data_loader import IterableDatasetShard, SeedableRandomSampler

    for i, dataloader in enumerate(dataloaders):
        sampler_name = f"{SAMPLER_NAME}.bin" if i == 0 else f"{SAMPLER_NAME}_{i}.bin"
        input_sampler_file = input_dir.joinpath(sampler_name)
        # Only load if we have our custom sampler
        if isinstance(dataloader.dataset, IterableDatasetShard):
            sampler = dataloader.get_sampler()
            if isinstance(sampler, SeedableRandomSampler):
                dataloader.set_sampler(load(input_sampler_file))
        if getattr(dataloader, "use_stateful_dataloader", False):
            dataloader_state_dict_name = "dl_state_dict.bin" if i == 0 else f"dl_state_dict_{i}.bin"
            input_dataloader_state_dict_file = input_dir.joinpath(dataloader_state_dict_name)
            # The stateful dataloader file is optional: silently skip it when absent
            if input_dataloader_state_dict_file.exists():
                state_dict = load(input_dataloader_state_dict_file, **load_kwargs)
                dataloader.load_state_dict(state_dict)
    logger.info("All dataloader sampler states loaded successfully")

    # GradScaler state
    if scaler is not None:
        input_scaler_file = input_dir.joinpath(SCALER_NAME)
        scaler_state = load(input_scaler_file)
        scaler.load_state_dict(scaler_state)
        logger.info("GradScaler state loaded successfully")

    # Random states
    # Restoring RNG state is best-effort: any failure is logged and the rest of the loaded state is kept.
    try:
        states = load(input_dir.joinpath(f"{RNG_STATE_NAME}_{process_index}.pkl"))
        if "step" in states:
            override_attributes["step"] = states["step"]
        random.setstate(states["random_state"])
        np.random.set_state(states["numpy_random_seed"])
        torch.set_rng_state(states["torch_manual_seed"])
        # The device checks below mirror the ones used when the states were saved, so a backend's state is only
        # looked up when that backend is available (and therefore was recorded on an equivalent machine).
        if is_xpu_available():
            torch.xpu.set_rng_state_all(states["torch_xpu_manual_seed"])
        if is_mlu_available():
            torch.mlu.set_rng_state_all(states["torch_mlu_manual_seed"])
        elif is_sdaa_available():
            torch.sdaa.set_rng_state_all(states["torch_sdaa_manual_seed"])
        elif is_musa_available():
            torch.musa.set_rng_state_all(states["torch_musa_manual_seed"])
        if is_hpu_available():
            torch.hpu.set_rng_state_all(states["torch_hpu_manual_seed"])
        if is_neuron_available():
            torch.neuron.set_rng_state_all(states["torch_neuron_manual_seed"])
        # Guarded instead of being an unconditional fallback: without CUDA no `torch_cuda_manual_seed` entry is
        # saved, and the resulting `KeyError` used to abort the restore before the XLA seed was applied.
        if is_cuda_available():
            torch.cuda.set_rng_state_all(states["torch_cuda_manual_seed"])
        if is_torch_xla_available():
            xm.set_rng_state(states["xm_seed"])
        logger.info("All random states loaded successfully")
    except Exception:
        logger.info("Could not load random states")

    return override_attributes


def save_custom_state(obj, path, index: int = 0, save_on_each_node: bool = False):
    """
    Writes `obj.state_dict()` to `{path}/custom_checkpoint_{index}.pkl`.

    Args:
        obj:
            Any object exposing a `state_dict()` method.
        path (`str` or `os.PathLike`):
            Folder in which the checkpoint file is created.
        index (`int`, *optional*, defaults to 0):
            Number used in the checkpoint file name.
        save_on_each_node (`bool`, *optional*, defaults to `False`):
            Forwarded as-is to `save`.
    """
    # NOTE(review): open question from the original author - is `get_pretty_name` the right way to get a
    # qual_name-like value from `obj`?
    checkpoint_file = Path(path).joinpath(f"custom_checkpoint_{index}.pkl")
    logger.info(f"Saving the state of {get_pretty_name(obj)} to {checkpoint_file}")
    custom_state = obj.state_dict()
    save(custom_state, checkpoint_file, save_on_each_node=save_on_each_node)


def load_custom_state(obj, path, index: int = 0):
    """
    Restores `obj` from the checkpoint stored at `{path}/custom_checkpoint_{index}.pkl`.

    The file is read with `map_location="cpu"` and `weights_only=False` is always passed to `load`; the result is
    handed to `obj.load_state_dict`.

    Args:
        obj:
            Any object exposing a `load_state_dict()` method.
        path (`str` or `os.PathLike`):
            Folder containing the checkpoint file.
        index (`int`, *optional*, defaults to 0):
            Number used in the checkpoint file name.
    """
    checkpoint_file = f"{path}/custom_checkpoint_{index}.pkl"
    logger.info(f"Loading the state of {get_pretty_name(obj)} from {checkpoint_file}")
    custom_state = load(checkpoint_file, map_location="cpu", weights_only=False)
    obj.load_state_dict(custom_state)


================================================
FILE: src/accelerate/commands/__init__.py
================================================
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: src/accelerate/commands/accelerate_cli.py
================================================
#!/usr/bin/env python

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from accelerate.commands.config import get_config_parser
from accelerate.commands.env import env_command_parser
from accelerate.commands.estimate import estimate_command_parser
from accelerate.commands.launch import launch_command_parser
from accelerate.commands.merge import merge_command_parser
from accelerate.commands.test import test_command_parser
from accelerate.commands.to_fsdp2 import to_fsdp2_command_parser
from accelerate.commands.tpu import tpu_command_parser
from accelerate.commands.utils import CustomArgumentParser


def main():
    """
    Entry point of the `accelerate` command line tool.

    Builds the top-level parser, registers every sub-command on it, parses the command line and runs the function
    the selected sub-command stored in `args.func`. When no sub-command provided a `func`, the help text is
    printed and the process exits with status 1.
    """
    parser = CustomArgumentParser("Accelerate CLI tool", usage="accelerate <command> [<args>]", allow_abbrev=False)
    subparsers = parser.add_subparsers(help="accelerate command helpers")

    # Register commands
    get_config_parser(subparsers=subparsers)
    estimate_command_parser(subparsers=subparsers)
    env_command_parser(subparsers=subparsers)
    launch_command_parser(subparsers=subparsers)
    merge_command_parser(subparsers=subparsers)
    tpu_command_parser(subparsers=subparsers)
    test_command_parser(subparsers=subparsers)
    to_fsdp2_command_parser(subparsers=subparsers)

    # Let's go
    args = parser.parse_args()

    if not hasattr(args, "func"):
        parser.print_help()
        # The `exit` builtin is injected by the `site` module and is missing under `python -S` or in frozen
        # builds, so raise `SystemExit` directly.
        raise SystemExit(1)

    # Run
    args.func(args)


if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/commands/config/__init__.py
================================================
#!/usr/bin/env python

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

from .config import config_command_parser
from .config_args import default_config_file, load_config_from_file  # noqa: F401
from .default import default_command_parser
from .update import update_command_parser


def get_config_parser(subparsers=None):
    """
    Builds the `config` command parser together with its `default` and `update` sub-commands.

    Args:
        subparsers (*optional*):
            Forwarded to `config_command_parser`.

    Returns:
        The parser returned by `config_command_parser`, with the sub-commands registered on it.
    """
    # Parser for the `config` command itself
    main_config_parser = config_command_parser(subparsers)
    # Help-less parent passed to every sub-command through `parents`
    shared_parent = argparse.ArgumentParser(add_help=False, allow_abbrev=False)
    # Container the sub-commands are attached to
    subcommand_group = main_config_parser.add_subparsers(title="subcommands", dest="subcommand")

    for register_subcommand in (default_command_parser, update_command_parser):
        register_subcommand(subcommand_group, parents=[shared_parent])

    return main_config_parser


def main():
    """
    Entry point used when the `config` command package is run on its own.

    Parses the command line with the parser returned by `get_config_parser` and runs the function stored in
    `args.func`. When no `func` was set, the help text is printed and the process exits with status 1.
    """
    config_parser = get_config_parser()
    args = config_parser.parse_args()

    if not hasattr(args, "func"):
        config_parser.print_help()
        # The `exit` builtin is injected by the `site` module and is missing under `python -S` or in frozen
        # builds, so raise `SystemExit` directly.
        raise SystemExit(1)

    # Run
    args.func(args)


if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/commands/config/cluster.py
================================================
#!/usr/bin/env python

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from ...utils import (
    ComputeEnvironment,
    DistributedType,
    is_deepspeed_available,
    is_fp8_available,
    is_hpu_available,
    is_mlu_available,
    is_mps_available,
    is_msamp_available,
    is_musa_available,
    is_neuron_available,
    is_npu_available,
    is_sdaa_available,
    is_torchao_available,
    is_transformer_engine_available,
    is_transformers_available,
    is_xpu_available,
)
from ...utils.constants import (
    DEEPSPEED_MULTINODE_LAUNCHERS,
    FSDP2_STATE_DICT_TYPE,
    FSDP_AUTO_WRAP_POLICY,
    FSDP_BACKWARD_PREFETCH,
    FSDP_SHARDING_STRATEGY,
    FSDP_STATE_DICT_TYPE,
    TORCH_DYNAMO_MODES,
)
from .config_args import ClusterConfig
from .config_utils import (
    DYNAMO_BACKENDS,
    _ask_field,
    _ask_options,
    _convert_distributed_mode,
    _convert_dynamo_backend,
    _convert_fp8_backend,
    _convert_mixed_precision,
    _convert_yes_no_to_bool,
)


def get_cluster_input():
    distributed_type = _ask_options(
        "Which type of machine are you using?",
        [
            "No distributed training",
            "multi-CPU",
            "multi-XPU",
            "multi-HPU",
            "multi-GPU",
            "multi-NPU",
            "multi-MLU",
            "multi-SDAA",
            "multi-MUSA",
            "multi-NEURON",
            "TPU",
        ],
        _convert_distributed_mode,
    )

    machine_rank = 0
    num_machines = 1
    num_processes = 1
    gpu_ids = None
    main_process_ip = None
    main_process_port = None
    rdzv_backend = "static"
    same_network = True
    debug = False

    if distributed_type in [
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_MLU,
        DistributedType.MULTI_SDAA,
        DistributedType.MULTI_MUSA,
        DistributedType.MULTI_NPU,
        DistributedType.MULTI_XPU,
        DistributedType.MULTI_CPU,
        DistributedType.MULTI_HPU,
        DistributedType.MULTI_NEURON,
    ]:
        num_machines = _ask_field(
            "How many different machines will you use (use more than 1 for multi-node training)? [1]: ",
            int,
            default=1,
        )
        if num_machines > 1:
            machine_rank = _ask_options(
                "What is the rank of this machine?",
                list(range(num_machines)),
                int,
            )
            main_process_ip = _ask_field(
                "What is the IP address of the machine that will host the main process? ",
            )
            main_process_port = _ask_field(
                "What is the port you will use to communicate with the main process? ",
                int,
            )
            same_network = _ask_field(
                "Are all the machines on the same local network? Answer `no` if nodes are on the cloud and/or on different network hosts [YES/no]: ",
                _convert_yes_no_to_bool,
                default=True,
                error_message="Please enter yes or no.",
            )
            if not same_network:
                rdzv_backend = _ask_field(
                    "What rendezvous backend will you use? ('static', 'c10d', ...): ", default="static"
                )
        debug = _ask_field(
            "Should distributed operations be checked while running for errors? This can avoid timeout issues but will be slower. [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )

    if distributed_type == DistributedType.NO:
        use_cpu = _ask_field(
            "Do you want to run your training on CPU only (even if a GPU / Apple Silicon / Ascend NPU device is available)? [yes/NO]:",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
    elif distributed_type == DistributedType.MULTI_CPU:
        use_cpu = True
    else:
        use_cpu = False

    mpirun_config = {}

    if use_cpu:
        if distributed_type == DistributedType.MULTI_CPU:
            use_mpirun = _ask_field(
                "Do you want accelerate to launch mpirun? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            if use_mpirun:
                mpirun_hostfile = _ask_field(
                    "Please enter the path to the hostfile to use with mpirun [~/hostfile]: ",
                    str,
                    default="~/hostfile",
                )
                mpirun_config["mpirun_hostfile"] = os.path.expanduser(mpirun_hostfile.strip())

    dynamo_config = {}
    use_dynamo = _ask_field(
        "Do you wish to optimize your script with torch dynamo?[yes/NO]:",
        _convert_yes_no_to_bool,
        default=False,
        error_message="Please enter yes or no.",
    )
    if use_dynamo:
        prefix = "dynamo_"
        dynamo_config[prefix + "backend"] = _ask_options(
            "Which dynamo backend would you like to use?",
            [x.lower() for x in DYNAMO_BACKENDS],
            _convert_dynamo_backend,
            default=2,
        )
        use_custom_options = _ask_field(
            "Do you want to customize the defaults sent to torch.compile? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )

        if use_custom_options:
            dynamo_config[prefix + "mode"] = _ask_options(
                "Which mode do you want to use?",
                TORCH_DYNAMO_MODES,
                lambda x: TORCH_DYNAMO_MODES[int(x)],
                default=0,
            )
            dynamo_config[prefix + "use_fullgraph"] = _ask_field(
                "Do you want the fullgraph mode or it is ok to break model into several subgraphs? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            dynamo_config[prefix + "use_dynamic"] = _ask_field(
                "Do you want to enable dynamic shape tracing? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            dynamo_config[prefix + "use_regional_compilation"] = _ask_field(
                "Do you want to enable regional compilation? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )

    use_mps = not use_cpu and is_mps_available()
    deepspeed_config = {}
    if (
        distributed_type
        in [
            DistributedType.MULTI_GPU,
            DistributedType.MULTI_XPU,
            DistributedType.MULTI_HPU,
            DistributedType.MULTI_NPU,
            DistributedType.MULTI_MLU,
            DistributedType.MULTI_SDAA,
            DistributedType.MULTI_MUSA,
            DistributedType.MULTI_NEURON,
            DistributedType.NO,
        ]
        and not use_mps
    ):
        use_deepspeed = _ask_field(
            "Do you want to use DeepSpeed? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if use_deepspeed:
            if distributed_type is DistributedType.MULTI_NEURON:
                raise RuntimeError("DeepSpeed is not supported on Neuron devices.")

            distributed_type = DistributedType.DEEPSPEED
            assert is_deepspeed_available(), (
                "DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source"
            )

        if distributed_type == DistributedType.DEEPSPEED:
            use_deepspeed_config = _ask_field(
                "Do you want to specify a json file to a DeepSpeed config? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            if use_deepspeed_config:
                deepspeed_config["deepspeed_config_file"] = _ask_field(
                    "Please enter the path to the json DeepSpeed config file: ",
                    str,
                    default="none",
                )
            else:
                deepspeed_config["zero_stage"] = _ask_options(
                    "What should be your DeepSpeed's ZeRO optimization stage?",
                    [0, 1, 2, 3],
                    int,
                    default=2,
                )

                deepspeed_devices = ["none", "cpu", "nvme"]
                if deepspeed_config["zero_stage"] >= 2:
                    deepspeed_config["offload_optimizer_device"] = _ask_options(
                        "Where to offload optimizer states?", deepspeed_devices, lambda x: deepspeed_devices[int(x)]
                    )
                    deepspeed_config["offload_param_device"] = _ask_options(
                        "Where to offload parameters?", deepspeed_devices, lambda x: deepspeed_devices[int(x)]
                    )
                    if deepspeed_config["offload_param_device"] == "nvme":
                        deepspeed_config["offload_param_nvme_path"] = _ask_field(
                            "Nvme Path to offload parameters?",
                            str,
                            default="/nvme",
                        )
                    if deepspeed_config["offload_optimizer_device"] == "nvme":
                        deepspeed_config["offload_optimizer_nvme_path"] = _ask_field(
                            "Nvme Path to offload optimizer states?",
                            str,
                            default="/nvme",
                        )
                deepspeed_config["gradient_accumulation_steps"] = _ask_field(
                    "How many gradient accumulation steps you're passing in your script? [1]: ",
                    int,
                    default=1,
                )
                use_gradient_clipping = _ask_field(
                    "Do you want to use gradient clipping? [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                    error_message="Please enter yes or no.",
                )
                if use_gradient_clipping:
                    deepspeed_config["gradient_clipping"] = _ask_field(
                        "What is the gradient clipping value? [1.0]: ",
                        float,
                        default=1.0,
                    )
                if deepspeed_config["zero_stage"] == 3:
                    deepspeed_config["zero3_save_16bit_model"] = _ask_field(
                        "Do you want to save 16-bit model weights when using ZeRO Stage-3? [yes/NO]: ",
                        _convert_yes_no_to_bool,
                        default=False,
                        error_message="Please enter yes or no.",
                    )
            deepspeed_config["zero3_init_flag"] = _ask_field(
                "Do you want to enable `deepspeed.zero.Init` when using ZeRO Stage-3 for constructing massive models? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            if deepspeed_config["zero3_init_flag"]:
                if not is_transformers_available():
                    raise Exception(
                        "When `zero3_init_flag` is set, it requires Transformers to be installed. "
                        "Please run `pip3 install transformers`."
                    )
            use_moe = _ask_field(
                "Do you want to enable Mixture-of-Experts training (MoE)? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            if use_moe:
                deepspeed_config["deepspeed_moe_layer_cls_names"] = _ask_field(
                    "Specify the comma-separated list of transformers MoE layer class names (case-sensitive), e.g : "
                    " `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ... : ",
                    str,
                )

            if num_machines > 1:
                launcher_query = "Which Type of launcher do you want to use?"
                deepspeed_config["deepspeed_multinode_launcher"] = _ask_options(
                    launcher_query,
                    DEEPSPEED_MULTINODE_LAUNCHERS,
                    lambda x: DEEPSPEED_MULTINODE_LAUNCHERS[int(x)],
                )

                if deepspeed_config["deepspeed_multinode_launcher"] != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
                    deepspeed_config["deepspeed_hostfile"] = _ask_field(
                        "DeepSpeed configures multi-node compute resources with hostfile. "
                        "Each row is of the format `hostname slots=[num_gpus]`, e.g., `localhost slots=2`; "
                        "for more information please refer official [documentation]"
                        "(https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node). "
                        "Please specify the location of hostfile: ",
                        str,
                    )

                    is_exclusion_filter = _ask_field(
                        "Do you want to specify exclusion filter string? [yes/NO]: ",
                        _convert_yes_no_to_bool,
                        default=False,
                        error_message="Please enter yes or no.",
                    )
                    if is_exclusion_filter:
                        deepspeed_config["deepspeed_exclusion_filter"] = _ask_field(
                            "DeepSpeed exclusion filter string: ",
                            str,
                        )

                    is_inclusion_filter = _ask_field(
                        "Do you want to specify inclusion filter string? [yes/NO]: ",
                        _convert_yes_no_to_bool,
                        default=False,
                        error_message="Please enter yes or no.",
                    )
                    if is_inclusion_filter:
                        deepspeed_config["deepspeed_inclusion_filter"] = _ask_field(
                            "DeepSpeed inclusion filter string: ",
                            str,
                        )

    fsdp_config = {}

    if distributed_type in [
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_NPU,
        DistributedType.MULTI_MLU,
        DistributedType.MULTI_SDAA,
        DistributedType.MULTI_MUSA,
        DistributedType.MULTI_XPU,
        DistributedType.MULTI_HPU,
        DistributedType.MULTI_NEURON,
    ]:
        use_fsdp = _ask_field(
            "Do you want to use FullyShardedDataParallel? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if use_fsdp:
            if distributed_type is DistributedType.MULTI_NEURON:
                raise NotImplementedError("FSDP is not currently supported on Neuron devices.")
            distributed_type = DistributedType.FSDP

        if distributed_type == DistributedType.FSDP:
            fsdp_config["fsdp_version"] = _ask_options(
                "What should be your FSDP version? [2]: ",
                [1, 2],
                lambda x: int(x) + 1,
                default=1,
            )
            fsdp_version = fsdp_config["fsdp_version"]  # extract to a variable to simplify usage later

            if fsdp_version == 1:
                sharding_strategy_query = "What should be your sharding strategy?"
                fsdp_config["fsdp_reshard_after_forward"] = _ask_options(
                    sharding_strategy_query,
                    FSDP_SHARDING_STRATEGY,
                    lambda x: FSDP_SHARDING_STRATEGY[int(x)],
                )
            else:
                fsdp_config["fsdp_reshard_after_forward"] = _ask_field(
                    "Do you want to enable resharding after forward? [YES/no]: ",
                    _convert_yes_no_to_bool,
                    default=True,
                    error_message="Please enter yes or no.",
                )

            fsdp_config["fsdp_offload_params"] = _ask_field(
                "Do you want to offload parameters and gradients to CPU? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )

            fsdp_wrap_query = "What should be your auto wrap policy?"
            fsdp_config["fsdp_auto_wrap_policy"] = _ask_options(
                fsdp_wrap_query,
                FSDP_AUTO_WRAP_POLICY,
                lambda x: FSDP_AUTO_WRAP_POLICY[int(x)],
            )
            if fsdp_config["fsdp_auto_wrap_policy"] == FSDP_AUTO_WRAP_POLICY[0]:
                use_no_split_modules = _ask_field(
                    "Do you want to use the model's `_no_split_modules` to wrap. Only applicable for 🤗 Transformers [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                    error_message="Please enter yes or no.",
                )
                if not use_no_split_modules:
                    fsdp_config["fsdp_transformer_layer_cls_to_wrap"] = _ask_field(
                        "Specify the comma-separated list of transformer layer class names (case-sensitive) to wrap ,e.g, :"
                        "`BertLayer`, `GPTJBlock`, `T5Block`, `BertLayer,BertEmbeddings,BertSelfOutput` ...? : ",
                        str,
                    )
            elif fsdp_config["fsdp_auto_wrap_policy"] == FSDP_AUTO_WRAP_POLICY[1]:
                fsdp_config["fsdp_min_num_params"] = _ask_field(
                    "What should be your FSDP's minimum number of parameters for Default Auto Wrapping Policy? [1e8]: ",
                    int,
                    default=100000000,
                )
            # Removed in FSDP2, ask for user input for FSDP1
            if fsdp_version == 1:
                fsdp_backward_prefetch_query = "What should be your FSDP's backward prefetch policy?"
                fsdp_config["fsdp_backward_prefetch"] = _ask_options(
                    fsdp_backward_prefetch_query,
                    FSDP_BACKWARD_PREFETCH,
                    lambda x: FSDP_BACKWARD_PREFETCH[int(x)],
                )

            fsdp_state_dict_type_query = "What should be your FSDP's state dict type?"
            fsdp_config["fsdp_state_dict_type"] = _ask_options(
                fsdp_state_dict_type_query,
                FSDP_STATE_DICT_TYPE if fsdp_version == 1 else FSDP2_STATE_DICT_TYPE,
                lambda x: FSDP_STATE_DICT_TYPE[int(x)] if fsdp_version == 1 else FSDP2_STATE_DICT_TYPE[int(x)],
                default=0,
            )
            # Not implemented in FSDP2, ask for user input for FSDP1
            if fsdp_version == 1:
                fsdp_config["fsdp_forward_prefetch"] = _ask_field(
                    "Do you want to enable FSDP's forward prefetch policy? [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                    error_message="Please enter yes or no.",
                )
            # Obsolete in FSDP2, ask for user input for FSDP1
            if fsdp_version == 1:
                fsdp_config["fsdp_use_orig_params"] = _ask_field(
                    "Do you want to enable FSDP's `use_orig_params` feature? [YES/no]: ",
                    _convert_yes_no_to_bool,
                    default=True,
                    error_message="Please enter yes or no.",
                )
            fsdp_config["fsdp_cpu_ram_efficient_loading"] = _ask_field(
                "Do you want to enable CPU RAM efficient model loading? Only applicable for 🤗 Transformers models. [YES/no]: ",
                _convert_yes_no_to_bool,
                default=True,
                error_message="Please enter yes or no.",
            )
            # Obsolete in FSDP2, ask for user input for FSDP1
            if fsdp_version == 1:
                if fsdp_config["fsdp_cpu_ram_efficient_loading"]:
                    fsdp_config["fsdp_sync_module_states"] = True
                else:
                    fsdp_config["fsdp_sync_module_states"] = _ask_field(
                        "Do you want each individually wrapped FSDP unit to broadcast module parameters from rank 0 at the start? [YES/no]: ",
                        _convert_yes_no_to_bool,
                        default=True,
                        error_message="Please enter yes or no.",
                    )
            fsdp_config["fsdp_activation_checkpointing"] = _ask_field(
                "Do you want to enable FSDP activation checkpointing? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )

    parallelism_config = {}

    if fsdp_config.get("fsdp_version", 1) == 2:
        use_parallelism_config = _ask_field(
            "Do you want to use the parallelism config? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )

        if use_parallelism_config:
            prefix = "parallelism_config_"
            parallelism_config[prefix + "dp_replicate_size"] = _ask_field(
                "What is the data parallelism replicate size? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )

            parallelism_config[prefix + "dp_shard_size"] = _ask_field(
                "What is the FSDP shard size? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )

            parallelism_config[prefix + "tp_size"] = _ask_field(
                "What is the tensor parallelism size? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )

            parallelism_config[prefix + "cp_size"] = _ask_field(
                "What is the context parallelism size? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )
            if parallelism_config[prefix + "cp_size"] > 1:
                parallelism_config[prefix + "cp_comm_strategy"] = _ask_options(
                    "What is the compute parallelism communication strategy?",
                    ["allgather", "alltoall"],
                    lambda x: ["allgather", "alltoall"][int(x)],
                    default=0,
                )

    megatron_lm_config = {}
    if distributed_type in [DistributedType.MULTI_GPU]:
        use_megatron_lm = _ask_field(
            "Do you want to use Megatron-LM ? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if use_megatron_lm:
            distributed_type = DistributedType.MEGATRON_LM
        if distributed_type == DistributedType.MEGATRON_LM:
            prefix = "megatron_lm_"
            megatron_lm_config[prefix + "tp_degree"] = _ask_field(
                "What is the Tensor Parallelism degree/size? [1]:",
                int,
                default=1,
                error_message="Please enter an integer.",
            )
            if megatron_lm_config[prefix + "tp_degree"] > 1:
                megatron_lm_config[prefix + "sequence_parallelism"] = _ask_field(
                    "Do you want to enable Sequence Parallelism? [YES/no]: ",
                    _convert_yes_no_to_bool,
                    default=True,
                    error_message="Please enter yes or no.",
                )

            megatron_lm_config[prefix + "pp_degree"] = _ask_field(
                "What is the Pipeline Parallelism degree/size? [1]:",
                int,
                default=1,
                error_message="Please enter an integer.",
            )
            if megatron_lm_config[prefix + "pp_degree"] > 1:
                megatron_lm_config[prefix + "num_micro_batches"] = _ask_field(
                    "What is the number of micro-batches? [1]:",
                    int,
                    default=1,
                    error_message="Please enter an integer.",
                )

            megatron_lm_config[prefix + "recompute_activations"] = _ask_field(
                "Do you want to enable selective activation recomputation? [YES/no]: ",
                _convert_yes_no_to_bool,
                default=True,
                error_message="Please enter yes or no.",
            )

            megatron_lm_config[prefix + "use_distributed_optimizer"] = _ask_field(
                "Do you want to use distributed optimizer "
                "which shards optimizer state and gradients across data parallel ranks? [YES/no]: ",
                _convert_yes_no_to_bool,
                default=True,
                error_message="Please enter yes or no.",
            )

            megatron_lm_config[prefix + "gradient_clipping"] = _ask_field(
                "What is the gradient clipping value based on global L2 Norm (0 to disable)? [1.0]: ",
                float,
                default=1.0,
            )
    # TPU specific defaults
    tpu_commands = None
    tpu_command_file = None
    tpu_downcast_bf16 = "no"
    tpu_env = []
    tpu_name = None
    tpu_vm = None
    tpu_zone = None
    tpu_use_sudo = False
    tpu_use_cluster = False

    if distributed_type in [
        DistributedType.MULTI_CPU,
        DistributedType.MULTI_XPU,
        DistributedType.MULTI_HPU,
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_MLU,
        DistributedType.MULTI_SDAA,
        DistributedType.MULTI_MUSA,
        DistributedType.MULTI_NPU,
        DistributedType.MULTI_NEURON,
        DistributedType.XLA,
    ]:
        machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "")
        if machine_type in ["TPU", "NEURON"]:
            machine_type += " cores"
        elif machine_type == "CPU":
            machine_type = "processes"
        else:
            machine_type += "(s)"
        num_processes = _ask_field(
            f"How many {machine_type} should be used for distributed training? [1]:",
            int,
            default=1,
            error_message="Please enter an integer.",
        )
    elif distributed_type in [DistributedType.FSDP, DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM]:
        num_processes = _ask_field(
            "How many GPU(s) should be used for distributed training? [1]:",
            int,
            default=1,
            error_message="Please enter an integer.",
        )
    else:
        num_processes = 1

    if (distributed_type == DistributedType.MULTI_GPU) and (num_machines == 1) and (num_processes == 1):
        raise ValueError(
            f"Specified distributed type {distributed_type} but only using 1 GPU on a single machine. Please select `No distributed training` for the type of machine you are using."
        )

    if (
        distributed_type
        in [
            DistributedType.MULTI_GPU,
            DistributedType.MULTI_MLU,
            DistributedType.MULTI_SDAA,
            DistributedType.MULTI_MUSA,
            DistributedType.MULTI_NPU,
            DistributedType.MULTI_XPU,
            DistributedType.MULTI_HPU,
            DistributedType.MULTI_NEURON,
            DistributedType.NO,
        ]
        and not use_cpu
        and not use_mps
    ):
        if is_npu_available():
            machine_type = "NPU(s)"
        elif is_mlu_available():
            machine_type = "MLU(s)"
        elif is_sdaa_available():
            machine_type = "SDAA(s)"
        elif is_musa_available():
            machine_type = "MUSA(s)"
        elif is_xpu_available():
            machine_type = "XPU(s)"
        elif is_hpu_available():
            machine_type = "HPU(s)"
        elif is_neuron_available():
            machine_type = "Neuron cores"
        else:
            machine_type = "GPU(s)"
        gpu_ids = _ask_field(
            f"What {machine_type} (by id) should be used for training on this machine as a comma-separated list? [all]:",
            default="all",
        )

    # CPU affinity is only supported on NVIDIA hardware for now
    enable_cpu_affinity = False
    if distributed_type in (DistributedType.NO, DistributedType.MULTI_GPU) and not use_cpu and not use_mps:
        enable_cpu_affinity = _ask_field(
            "Would you like to enable numa efficiency? (Currently only supported on NVIDIA hardware). [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )

    fp8_config = None
    if distributed_type == DistributedType.XLA:
        mixed_precision = "no"
        main_training_function = _ask_field(
            "What is the name of the function in your script that should be launched in all parallel scripts? [main]: ",
            default="main",
        )
        tpu_use_cluster = _ask_field(
            "Are you using a TPU cluster? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if tpu_use_cluster:
            tpu_name = _ask_field(
                "What is the name of your TPU cluster? ",
                default=None,
                error_message="Please enter the name of your TPU cluster.",
            )
            tpu_zone = _ask_field(
                "What is the zone of your TPU cluster? ",
                default=None,
                error_message="Please enter the zone of your TPU cluster.",
            )
            tpu_use_sudo = _ask_field(
                "To run a python script in a TPU pod, should `sudo` be used? [yes/NO]: ",
                default=False,
                error_message="Please enter yes or no.",
            )
            run_commands = _ask_field(
                "Do you have code you wish to run on startup in each pod? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            if run_commands:
                use_command_file = _ask_field(
                    "Is this code located in a bash script? [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                    error_message="Please enter yes or no.",
                )
                if use_command_file:
                    tpu_command_file = _ask_field(
                        "What is the path to your bash script? ",
                        default=None,
                        error_message="Please enter the path to your bash script.",
                    )
                    tpu_command_file = os.path.abspath(tpu_command_file)
                else:
                    print("Please enter each command separately you wish to run on startup in each pod.")
                    tpu_commands = []
                    another_command = True
                    while another_command:
                        tpu_commands.append(
                            _ask_field(
                                "Please enter a single command to be ran ",
                                default=None,
                                error_message="Please enter the commands you wish to run on startup in each pod as a single string.",
                            )
                        )
                        another_command = _ask_field(
                            "Do you wish to add another command? [yes/NO]: ",
                            _convert_yes_no_to_bool,
                            default=False,
                            error_message="Please enter yes or no.",
                        )
            tpu_vm = _ask_field(
                "If not using an instance group, what are the names of the Compute VM instances to be used, separated by a comma: ",
                default="",
            ).split(",")
            tpu_env = _ask_field(
                "What environment variables do you wish to set in each pod, separated by a comma: ",
                default="",
            ).split(",")

    else:
        main_training_function = "main"
        if distributed_type == DistributedType.DEEPSPEED and use_deepspeed_config:
            mixed_precision = None
        else:
            mixed_precision = _ask_options(
                "Do you wish to use mixed precision?",
                ["no", "fp16", "bf16", "fp8"],
                _convert_mixed_precision,
            )
            if mixed_precision == "fp8":
                if not is_fp8_available():
                    raise ValueError(
                        "FP8 (either torchao, Transformer Engine or MSAMP) is not installed on this machine."
                    )
                fp8_config = {}
                fp8_config["backend"] = _ask_options(
                    "Which FP8 backend do you want to use?",
                    ["ao", "te", "msamp"],
                    _convert_fp8_backend,
                )
                if fp8_config["backend"] == "TE":
                    if not is_transformer_engine_available():
                        raise ValueError("TransformersEngine was selected, but it is not installed on this machine.")
                    fp8_config["use_autocast_during_eval"] = _ask_field(
                        "Do you want to use FP8 autocast during eval mode? Generally better metrics are found when this is disabled [yes/NO]: ",
                        _convert_yes_no_to_bool,
                        default=False,
                    )
                    fp8_config["margin"] = _ask_field(
                        "What margin should be used for gradient scaling? [0]: ",
                        int,
                        default=0,
                    )
                    fp8_config["interval"] = _ask_field(
                        "What interval should be used for for how often the scaling factor is recomputed? [1]: ",
                        int,
                        default=1,
                    )
                    fp8_config["fp8_format"] = _ask_options(
                        "Which weight format should be used?",
                        ["HYBRID", "E4M3", "E5M2"],
                        lambda i: ["HYBRID", "E4M3", "E5M2"][i],
                        default=0,
                    )
                    fp8_config["amax_history_length"] = _ask_field(
                        "What length of history should be used for the amax scaling factor computation? [1024]: ",
                        int,
                        default=1024,
                    )
                    fp8_config["amax_compute_algorithm"] = _ask_options(
                        "Which algorithm should be used for the amax scaling factor computation?",
                        ["max", "most_recent"],
                        lambda x: "max" if x == 0 else "most_recent",
                        default=0,
                    )
                    fp8_config["override_linear_precision"] = _ask_field(
                        "Do you want to to execute `fprop`, `dgrad`, and `wgrad` GEMMS in higher precision? [yes/NO]: ",
                        _convert_yes_no_to_bool,
                        default=False,
                    )
                    if fp8_config["override_linear_precision"]:
                        fprop = _ask_field(
                            "Should `fprop` be executed in higher precision? [yes/NO]: ",
                            _convert_yes_no_to_bool,
                            default=False,
                        )
                        dgrad = _ask_field(
                            "Should `dgrad` be executed in higher precision? [yes/NO]: ",
                            _convert_yes_no_to_bool,
                            default=False,
                        )
                        wgrad = _ask_field(
                            "Should `wgrad` be executed in higher precision? [yes/NO]: ",
                            _convert_yes_no_to_bool,
                            default=False,
                        )
                        fp8_config["override_linear_precision"] = (fprop, dgrad, wgrad)
                    else:
                        fp8_config["override_linear_precision"] = (False, False, False)

                elif fp8_config["backend"] == "MSAMP":
                    if not is_msamp_available():
                        raise ValueError("MSAMP was selected, but it is not installed on this machine.")
                    fp8_config["optimization_level"] = _ask_options(
                        "Which optimization level should be used?",
                        ["O1", "O2"],
                        lambda x: "O1" if x == 0 else "O2",
                        default=1,
                    )

                elif fp8_config["backend"] == "AO":
                    if not is_torchao_available():
                        raise ValueError("torchao was selected, but it is not installed on this machine.")
                    fp8_config["enable_fsdp_float8_all_gather"] = _ask_field(
                        "Do you want to enable FSDP2 float8 all gather? This is recommended for better performance if using FSDP2. [YES/no]: ",
                        _convert_yes_no_to_bool,
                        default=True,
                    )
                    fp8_config["pad_inner_dim"] = _ask_field(
                        "Do you want to pad the inner dimension of weight matrices before float8 matmuls? This is required for _scaled_mm which has strict alignment requirements. Note: padding may cause memory spikes. [YES/no]: ",
                        _convert_yes_no_to_bool,
                        default=True,
                    )

    if use_dynamo and mixed_precision == "no" and not use_cpu:
        print(
            "Torch dynamo used without mixed precision requires TF32 to be efficient. Accelerate will enable it by default when launching your scripts."
        )

    if distributed_type == DistributedType.XLA and mixed_precision == "bf16":
        tpu_downcast_bf16 = _ask_field(
            "Should `torch.float` be cast as `bfloat16` and `torch.double` remain `float32` on TPUs?", default="no"
        )

    return ClusterConfig(
        compute_environment=ComputeEnvironment.LOCAL_MACHINE,
        distributed_type=distributed_type,
        num_processes=num_processes,
        gpu_ids=gpu_ids,
        mixed_precision=mixed_precision,
        downcast_bf16=tpu_downcast_bf16,
        machine_rank=machine_rank,
        num_machines=num_machines,
        main_process_ip=main_process_ip,
        main_process_port=main_process_port,
        main_training_function=main_training_function,
        fp8_config=fp8_config,
        deepspeed_config=deepspeed_config,
        fsdp_config=fsdp_config,
        parallelism_config=parallelism_config,
        megatron_lm_config=megatron_lm_config,
        mpirun_config=mpirun_config,
        use_cpu=use_cpu,
        rdzv_backend=rdzv_backend,
        same_network=same_network,
        commands=tpu_commands,
        command_file=tpu_command_file,
        tpu_env=tpu_env,
        tpu_name=tpu_name,
        tpu_vm=tpu_vm,
        tpu_zone=tpu_zone,
        tpu_use_sudo=tpu_use_sudo,
        tpu_use_cluster=tpu_use_cluster,
        dynamo_config=dynamo_config,
        debug=debug,
        enable_cpu_affinity=enable_cpu_affinity,
    )


================================================
FILE: src/accelerate/commands/config/config.py
================================================
#!/usr/bin/env python

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os

from accelerate.utils import ComputeEnvironment

from .cluster import get_cluster_input
from .config_args import cache_dir, default_config_file, default_yaml_config_file, load_config_from_file  # noqa: F401
from .config_utils import _ask_field, _ask_options, _convert_compute_environment  # noqa: F401
from .sagemaker import get_sagemaker_input


# Text passed as the `description` of the argument parser built in this module.
description = (
    "Launches a series of prompts to create and save a `default_config.yaml` configuration file "
    "for your training system. Should always be ran first on your machine"
)


def get_user_input():
    """
    Ask which compute environment is in use and run the matching questionnaire.

    Returns:
        The config returned by `get_sagemaker_input()` when the answer is `ComputeEnvironment.AMAZON_SAGEMAKER`,
        otherwise the config returned by `get_cluster_input()`.
    """
    environment_choices = ["This machine", "AWS (Amazon SageMaker)"]
    selected_environment = _ask_options(
        "In which compute environment are you running?",
        environment_choices,
        _convert_compute_environment,
    )
    if selected_environment == ComputeEnvironment.AMAZON_SAGEMAKER:
        return get_sagemaker_input()
    return get_cluster_input()


def config_command_parser(subparsers=None):
    """
    Build the argument parser for the `config` command.

    Args:
        subparsers (*optional*):
            Object whose `add_parser` method is used to register a `config` sub-command. When `None`, a standalone
            `argparse.ArgumentParser` is created instead.

    Returns:
        The parser carrying the `--config_file` option. When it was registered through `subparsers`, its default
        `func` is set to `config_command`.
    """
    standalone = subparsers is None
    if standalone:
        parser = argparse.ArgumentParser("Accelerate config command", description=description)
    else:
        parser = subparsers.add_parser("config", description=description)

    config_file_help = (
        "The path to use to store the config file. Will default to a file named default_config.yaml in the cache "
        "location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have "
        "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
        "with 'huggingface'."
    )
    parser.add_argument("--config_file", default=None, help=config_file_help)

    # Only a sub-command needs a dispatch target; the standalone parser is driven by `main()`.
    if not standalone:
        parser.set_defaults(func=config_command)
    return parser


def config_command(args):
    """
    Run the interactive questionnaire and save the resulting configuration.

    Args:
        args:
            Parsed command-line arguments. Only `args.config_file` is read: when it is `None` the configuration is
            written to `default_yaml_config_file` (creating `cache_dir` if needed), otherwise to the given path.

    The file is written as JSON when its name ends with `.json` and as YAML otherwise. The final location is printed.
    """
    config = get_user_input()
    if args.config_file is not None:
        config_file = args.config_file
    else:
        # `exist_ok=True` removes the window between checking for the directory and creating it,
        # so a concurrent process creating the cache directory no longer makes this call fail.
        os.makedirs(cache_dir, exist_ok=True)
        config_file = default_yaml_config_file

    if config_file.endswith(".json"):
        config.to_json_file(config_file)
    else:
        config.to_yaml_file(config_file)
    print(f"accelerate configuration saved at {config_file}")


def main():
    """Parse the command line with a standalone parser and run `config_command` on the result."""
    cli_args = config_command_parser().parse_args()
    config_command(cli_args)


# Run the command when this module is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/commands/config/config_args.py
================================================
#!/usr/bin/env python

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
from dataclasses import dataclass
from enum import Enum
from typing import Optional, Union

import yaml

from ...utils import ComputeEnvironment, DistributedType, SageMakerDistributedType
from ...utils.constants import SAGEMAKER_PYTHON_VERSION, SAGEMAKER_PYTORCH_VERSION, SAGEMAKER_TRANSFORMERS_VERSION


# Cache root: `HF_HOME` when set, otherwise `<XDG_CACHE_HOME or ~/.cache>/huggingface`, with `~` expanded.
hf_cache_home = os.path.expanduser(
    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
)
cache_dir = os.path.join(hf_cache_home, "accelerate")
# NOTE(review): both defaults resolve to the same `default_config.yaml` path, so the JSON fallback
# below can never be selected — confirm whether this is intentional before changing either value.
default_json_config_file = os.path.join(cache_dir, "default_config.yaml")
default_yaml_config_file = os.path.join(cache_dir, "default_config.yaml")

# For backward compatibility: the JSON default is used only if it exists and the YAML default does not.
if not os.path.isfile(default_yaml_config_file) and os.path.isfile(default_json_config_file):
    default_config_file = default_json_config_file
else:
    default_config_file = default_yaml_config_file


def load_config_from_file(config_file):
    """
    Load a saved configuration from a JSON or YAML file.

    Args:
        config_file (`str`, *optional*):
            Path of the file to read. `None` falls back to `default_config_file`.

    Returns:
        A `ClusterConfig` when the file's `compute_environment` entry equals `ComputeEnvironment.LOCAL_MACHINE` (also
        the value assumed when the entry is missing), otherwise a `SageMakerConfig`. Files whose name ends with
        `.json` are parsed as JSON, all others as YAML.

    Raises:
        FileNotFoundError: If an explicitly passed `config_file` does not exist.
    """
    if config_file is None:
        config_file = default_config_file
    elif not os.path.isfile(config_file):
        raise FileNotFoundError(
            f"The passed configuration file `{config_file}` does not exist. "
            "Please pass an existing file to `accelerate launch`, or use the default one "
            "created through `accelerate config` and run `accelerate launch` "
            "without the `--config_file` argument."
        )

    is_json = config_file.endswith(".json")
    # First pass: peek at the raw content only to decide which config class applies.
    with open(config_file, encoding="utf-8") as f:
        raw_config = json.load(f) if is_json else yaml.safe_load(f)
    environment = raw_config.get("compute_environment", ComputeEnvironment.LOCAL_MACHINE)
    config_class = ClusterConfig if environment == ComputeEnvironment.LOCAL_MACHINE else SageMakerConfig

    # Second pass: the chosen class re-reads the file and applies its own defaults and validation.
    if is_json:
        return config_class.from_json_file(json_file=config_file)
    return config_class.from_yaml_file(yaml_file=config_file)


@dataclass
class BaseConfig:
    """
    Fields and JSON/YAML (de)serialization helpers shared by the configuration dataclasses.

    String values given for `compute_environment` and `distributed_type` are converted to their enum members in
    `__post_init__`.
    """

    compute_environment: ComputeEnvironment
    distributed_type: Union[DistributedType, SageMakerDistributedType]
    mixed_precision: str
    use_cpu: bool
    debug: bool

    def to_dict(self):
        """
        Return a serializable copy of the instance attributes.

        Enum members are replaced by their `.value` (recursively inside dictionaries). Top-level attributes that are
        `None` or an empty dictionary are left out. Neither the instance nor its nested dictionaries are modified.
        """

        # For serialization, it's best to convert Enums to strings (or their underlying value type).
        def _convert_enums(value):
            if isinstance(value, Enum):
                return value.value
            if isinstance(value, dict):
                # Empty sections become `None` so that the filter below drops them.
                if not value:
                    return None
                # Build a new dictionary instead of rewriting the one owned by the instance.
                return {key: _convert_enums(item) for key, item in value.items()}
            return value

        converted = {key: _convert_enums(value) for key, value in self.__dict__.items()}
        return {key: value for key, value in converted.items() if value is not None}

    @staticmethod
    def process_config(config_dict):
        """
        Processes `config_dict` in place and sets default values for any missing keys.

        Legacy `fp16` and `dynamo_backend` entries are converted to `mixed_precision` and `dynamo_config`.

        Returns:
            The same `config_dict` object, updated.

        Raises:
            ValueError: If `distributed_type` is missing.
        """
        if "compute_environment" not in config_dict:
            config_dict["compute_environment"] = ComputeEnvironment.LOCAL_MACHINE
        if "distributed_type" not in config_dict:
            raise ValueError("A `distributed_type` must be specified in the config file.")
        if "num_processes" not in config_dict and config_dict["distributed_type"] == DistributedType.NO:
            config_dict["num_processes"] = 1
        if "mixed_precision" not in config_dict:
            config_dict["mixed_precision"] = "fp16" if config_dict.get("fp16") else None
        # Convert the config to the new format.
        config_dict.pop("fp16", None)
        if "dynamo_backend" in config_dict:  # Convert the config to the new format.
            dynamo_backend = config_dict.pop("dynamo_backend")
            config_dict["dynamo_config"] = {} if dynamo_backend == "NO" else {"dynamo_backend": dynamo_backend}
        config_dict.setdefault("use_cpu", False)
        config_dict.setdefault("debug", False)
        config_dict.setdefault("enable_cpu_affinity", False)
        return config_dict

    @classmethod
    def _from_config_dict(cls, config_dict, config_file):
        """Apply defaults to `config_dict`, reject keys that are not fields of `cls`, and build the instance."""
        config_dict = cls.process_config(config_dict)
        extra_keys = sorted(set(config_dict.keys()) - set(cls.__dataclass_fields__.keys()))
        if len(extra_keys) > 0:
            raise ValueError(
                f"The config file at {config_file} had unknown keys ({extra_keys}), please try upgrading your `accelerate`"
                " version or fix (and potentially remove) these keys from your config file."
            )
        return cls(**config_dict)

    @classmethod
    def from_json_file(cls, json_file=None):
        """
        Build a config from a JSON file (`default_json_config_file` when `json_file` is `None`).

        Raises:
            ValueError: If the file contains keys that are not fields of `cls`, or lacks `distributed_type`.
        """
        json_file = default_json_config_file if json_file is None else json_file
        with open(json_file, encoding="utf-8") as f:
            config_dict = json.load(f)
        return cls._from_config_dict(config_dict, json_file)

    def to_json_file(self, json_file):
        """Write `to_dict()` to `json_file` as indented, key-sorted JSON followed by a newline."""
        # Serialize before opening so a serialization failure does not truncate an existing file.
        content = json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
        with open(json_file, "w", encoding="utf-8") as f:
            f.write(content)

    @classmethod
    def from_yaml_file(cls, yaml_file=None):
        """
        Build a config from a YAML file (`default_yaml_config_file` when `yaml_file` is `None`).

        Raises:
            ValueError: If the file contains keys that are not fields of `cls`, or lacks `distributed_type`.
        """
        yaml_file = default_yaml_config_file if yaml_file is None else yaml_file
        with open(yaml_file, encoding="utf-8") as f:
            config_dict = yaml.safe_load(f)
        return cls._from_config_dict(config_dict, yaml_file)

    def to_yaml_file(self, yaml_file):
        """Write `to_dict()` to `yaml_file` using `yaml.safe_dump`."""
        # Build the dictionary before opening so a failure in `to_dict()` does not truncate an existing file.
        config_dict = self.to_dict()
        with open(yaml_file, "w", encoding="utf-8") as f:
            yaml.safe_dump(config_dict, f)

    def __post_init__(self):
        """Convert string fields to their enum members and make sure `dynamo_config` is a dictionary."""
        if isinstance(self.compute_environment, str):
            self.compute_environment = ComputeEnvironment(self.compute_environment)
        if isinstance(self.distributed_type, str):
            # The enum used for `distributed_type` depends on the compute environment.
            if self.compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER:
                self.distributed_type = SageMakerDistributedType(self.distributed_type)
            else:
                self.distributed_type = DistributedType(self.distributed_type)
        if getattr(self, "dynamo_config", None) is None:
            self.dynamo_config = {}


@dataclass
class ClusterConfig(BaseConfig):
    """Configuration fields added on top of `BaseConfig` for the non-SageMaker config class."""

    # -1 by default, for instance if we use SLURM and the user manually passes it in
    num_processes: int = -1
    machine_rank: int = 0
    num_machines: int = 1
    gpu_ids: Optional[str] = None
    main_process_ip: Optional[str] = None
    main_process_port: Optional[int] = None
    rdzv_backend: Optional[str] = "static"
    same_network: Optional[bool] = False
    main_training_function: str = "main"
    enable_cpu_affinity: bool = False

    # FP8 training options
    fp8_config: Optional[dict] = None
    # deepspeed_plugin options
    deepspeed_config: Optional[dict] = None
    # FSDP options
    fsdp_config: Optional[dict] = None
    # parallelism config options
    parallelism_config: Optional[dict] = None
    # megatron_lm options
    megatron_lm_config: Optional[dict] = None
    # mpirun options
    mpirun_config: Optional[dict] = None
    # TPU option
    # NOTE(review): annotated as `bool`, but the interactive cluster prompt passes the string "no" by default — confirm
    downcast_bf16: bool = False

    # TPU pod options
    tpu_name: Optional[str] = None
    tpu_zone: Optional[str] = None
    tpu_use_cluster: bool = False
    tpu_use_sudo: bool = False
    command_file: Optional[str] = None
    commands: list[str] = None
    tpu_vm: list[str] = None
    tpu_env: list[str] = None

    # dynamo options
    dynamo_config: Optional[dict] = None

    def __post_init__(self):
        # Replace every unset optional section with an empty dictionary before the base-class conversions run.
        for section_name in (
            "deepspeed_config",
            "fsdp_config",
            "megatron_lm_config",
            "mpirun_config",
            "fp8_config",
            "parallelism_config",
        ):
            if getattr(self, section_name) is None:
                setattr(self, section_name, {})
        return super().__post_init__()


@dataclass
class SageMakerConfig(BaseConfig):
    """
    Configuration fields for launches on Amazon SageMaker.

    `ec2_instance_type` and `iam_role_name` have no default and must always be provided; every other field declared
    here is optional.
    """

    # Required: instance type to train on and name of the IAM role used for the job.
    ec2_instance_type: str
    iam_role_name: str
    # Optional custom Docker image and AWS profile / region.
    image_uri: Optional[str] = None
    profile: Optional[str] = None
    region: str = "us-east-1"
    num_machines: int = 1
    gpu_ids: str = "all"
    # NOTE(review): this f-string is evaluated once, when the class body runs, so `num_machines` here is the class
    # default (1) and the default job name is always "accelerate-sagemaker-1", regardless of the value an instance
    # is created with. Confirm whether that is intended.
    base_job_name: str = f"accelerate-sagemaker-{num_machines}"
    # Library / Python versions, taken from the module-level SAGEMAKER_* constants.
    pytorch_version: str = SAGEMAKER_PYTORCH_VERSION
    transformers_version: str = SAGEMAKER_TRANSFORMERS_VERSION
    py_version: str = SAGEMAKER_PYTHON_VERSION
    # Optional paths to the TSV files describing input channels and metrics.
    sagemaker_inputs_file: Optional[str] = None
    sagemaker_metrics_file: Optional[str] = None
    additional_args: Optional[dict] = None
    dynamo_config: Optional[dict] = None
    enable_cpu_affinity: bool = False


================================================
FILE: src/accelerate/commands/config/config_utils.py
================================================
#!/usr/bin/env python

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

from ...utils.dataclasses import (
    ComputeEnvironment,
    DistributedType,
    DynamoBackend,
    FP8BackendType,
    PrecisionType,
    SageMakerDistributedType,
)
from ..menu import BulletMenu


# Names of the dynamo backends offered in the interactive menus. The position of an entry is the index the menu
# returns, so the order must stay stable (e.g. callers use index 2, "INDUCTOR", as the default choice).
DYNAMO_BACKENDS = [
    "EAGER",
    "AOT_EAGER",
    "INDUCTOR",
    "AOT_TS_NVFUSER",
    "NVPRIMS_NVFUSER",
    "CUDAGRAPHS",
    "OFI",
    "FX2TRT",
    "ONNXRT",
    "TENSORRT",
    "AOT_TORCHXLA_TRACE_ONCE",
    # Was misspelled "TORHCHXLA_TRACE_ONCE", which cannot be resolved to a backend name.
    "TORCHXLA_TRACE_ONCE",
    "TVM",
]


def _ask_field(input_text, convert_value=None, default=None, error_message=None):
    ask_again = True
    while ask_again:
        result = input(input_text)
        try:
            if default is not None and len(result) == 0:
                return default
            return convert_value(result) if convert_value is not None else result
        except Exception:
            if error_message is not None:
                print(error_message)


def _ask_options(input_text, options=None, convert_value=None, default=0):
    """
    Show a bullet menu and return the user's selection.

    Args:
        input_text (`str`):
            Prompt displayed above the choices.
        options (`list`, *optional*):
            Choices to display. Defaults to an empty list.
        convert_value (`Callable`, *optional*):
            Applied to the value returned by the menu before it is handed back.
        default (`int`, *optional*, defaults to 0):
            Forwarded to the menu as `default_choice`.

    Returns:
        The menu result, converted with `convert_value` when one is given.
    """
    # A fresh list per call: a shared mutable default would be handed to (and could be kept by) the menu object.
    options = [] if options is None else options
    menu = BulletMenu(input_text, options)
    selection = menu.run(default_choice=default)
    return convert_value(selection) if convert_value is not None else selection


def _convert_compute_environment(value):
    """Turn a menu index (int or numeric string) into the matching `ComputeEnvironment` member."""
    environments = ["LOCAL_MACHINE", "AMAZON_SAGEMAKER"]
    return ComputeEnvironment(environments[int(value)])


def _convert_distributed_mode(value):
    """Turn a menu index (int or numeric string) into the matching `DistributedType` member."""
    # The order mirrors the order of the choices shown in the menu.
    distributed_modes = [
        "NO",
        "MULTI_CPU",
        "MULTI_XPU",
        "MULTI_HPU",
        "MULTI_GPU",
        "MULTI_NPU",
        "MULTI_MLU",
        "MULTI_SDAA",
        "MULTI_MUSA",
        "MULTI_NEURON",
        "XLA",
    ]
    index = int(value)
    return DistributedType(distributed_modes[index])


def _convert_dynamo_backend(value):
    """Turn a menu index into the `.value` of the `DynamoBackend` member named at that position of `DYNAMO_BACKENDS`."""
    backend_name = DYNAMO_BACKENDS[int(value)]
    return DynamoBackend(backend_name).value


def _convert_mixed_precision(value):
    """Turn a menu index (int or numeric string) into the matching `PrecisionType` member."""
    precisions = ["no", "fp16", "bf16", "fp8"]
    return PrecisionType(precisions[int(value)])


def _convert_sagemaker_distributed_mode(value):
    """Turn a menu index (int or numeric string) into the matching `SageMakerDistributedType` member."""
    sagemaker_modes = ["NO", "DATA_PARALLEL", "MODEL_PARALLEL"]
    return SageMakerDistributedType(sagemaker_modes[int(value)])


def _convert_fp8_backend(value):
    """Turn a menu index (int or numeric string) into the matching `FP8BackendType` member."""
    fp8_backends = ["AO", "TE", "MSAMP"]
    return FP8BackendType(fp8_backends[int(value)])


def _convert_yes_no_to_bool(value):
    return {"yes": True, "no": False}[value.lower()]


class SubcommandHelpFormatter(argparse.RawDescriptionHelpFormatter):
    """
    Help formatter for subcommands: strips the generic `<command> [<args>] ` placeholder from the usage text.
    """

    def _format_usage(self, usage, actions, groups, prefix):
        # Let argparse build the usage text, then drop the placeholder inherited from the parent parser.
        formatted = super()._format_usage(usage, actions, groups, prefix)
        return formatted.replace("<command> [<args>] ", "")


================================================
FILE: src/accelerate/commands/config/default.py
================================================
#!/usr/bin/env python

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

import torch

from ...utils import (
    is_hpu_available,
    is_mlu_available,
    is_musa_available,
    is_neuron_available,
    is_npu_available,
    is_sdaa_available,
    is_xpu_available,
)
from .config_args import ClusterConfig, default_json_config_file
from .config_utils import SubcommandHelpFormatter


# Help text of the `default` subcommand (passed as `help=` when the subparser is registered below).
description = "Create a default config file for Accelerate with only a few flags set."


def write_basic_config(mixed_precision="no", save_location: str = default_json_config_file):
    """
    Creates and saves a basic cluster config to be used on a local machine with potentially multiple accelerators.
    Will also set CPU if it is a CPU-only machine.

    The first available accelerator in the order MLU, SDAA, MUSA, HPU, CUDA, XPU, NPU, Neuron is used; the number of
    processes is set to its device count and a multi-device distributed type is selected when more than one device is
    found.

    Args:
        mixed_precision (`str`, *optional*, defaults to "no"):
            Mixed Precision to use. Should be one of "no", "fp16", "bf16", or "fp8" (case-insensitive).
        save_location (`str`, *optional*, defaults to `default_json_config_file`):
            Optional custom save location. Should be passed to `--config_file` when using `accelerate launch`. Default
            location is inside the huggingface cache folder (`~/.cache/huggingface`) but can be overridden by setting
            the `HF_HOME` environmental variable, followed by `accelerate/default_config.yaml`.

    Returns:
        `pathlib.Path` or `bool`: The path the config was written to, or `False` if a file already exists at
        `save_location` (in which case nothing is written).

    Raises:
        ValueError: If `mixed_precision` is not one of the supported values.
    """
    path = Path(save_location)
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists():
        print(
            f"Configuration already exists at {save_location}, will not override. Run `accelerate config` manually or pass a different `save_location`."
        )
        return False
    mixed_precision = mixed_precision.lower()
    if mixed_precision not in ["no", "fp16", "bf16", "fp8"]:
        raise ValueError(
            f"`mixed_precision` should be one of 'no', 'fp16', 'bf16', or 'fp8'. Received {mixed_precision}"
        )
    config = {
        "compute_environment": "LOCAL_MACHINE",
        "mixed_precision": mixed_precision,
    }

    # (availability check, name of the `torch` device namespace, multi-device distributed type), by priority.
    # A single ordered table guarantees that exactly one branch is taken: previously the MLU check was a separate
    # `if`, so on an MLU-only machine the CPU fallback of the following chain overwrote the MLU settings.
    accelerators = (
        (is_mlu_available, "mlu", "MULTI_MLU"),
        (is_sdaa_available, "sdaa", "MULTI_SDAA"),
        (is_musa_available, "musa", "MULTI_MUSA"),
        (is_hpu_available, "hpu", "MULTI_HPU"),
        (torch.cuda.is_available, "cuda", "MULTI_GPU"),
        (is_xpu_available, "xpu", "MULTI_XPU"),
        (is_npu_available, "npu", "MULTI_NPU"),
        (is_neuron_available, "neuron", "MULTI_NEURON"),
    )
    for is_available, device_namespace, multi_device_type in accelerators:
        if not is_available():
            continue
        # The namespace (e.g. `torch.mlu`) is only touched once the backend is known to be available.
        num_devices = getattr(torch, device_namespace).device_count()
        config["num_processes"] = num_devices
        config["use_cpu"] = False
        config["distributed_type"] = multi_device_type if num_devices > 1 else "NO"
        break
    else:
        # No accelerator found: single-process CPU run.
        config["use_cpu"] = True
        config["num_processes"] = 1
        config["distributed_type"] = "NO"

    config["debug"] = False
    config["enable_cpu_affinity"] = False
    config = ClusterConfig(**config)
    config.to_json_file(path)
    return path


def default_command_parser(parser, parents):
    """
    Register the `default` subcommand on `parser` and return the created subparser.

    The subcommand accepts `--config_file` (stored as `save_location`) and `--mixed_precision`, and dispatches to
    `default_config_command`.
    """
    config_file_help = (
        "The path to use to store the config file. Will default to a file named default_config.yaml in the cache "
        "location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have "
        "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
        "with 'huggingface'."
    )
    mixed_precision_help = (
        "Whether or not to use mixed precision training. "
        "Choose between FP16 and BF16 (bfloat16) training. "
        "BF16 training is only supported on Nvidia Ampere GPUs and PyTorch 1.10 or later."
    )

    subparser = parser.add_parser(
        "default", parents=parents, help=description, formatter_class=SubcommandHelpFormatter
    )
    subparser.add_argument(
        "--config_file",
        dest="save_location",
        default=default_json_config_file,
        help=config_file_help,
    )
    subparser.add_argument(
        "--mixed_precision",
        type=str,
        choices=["no", "fp16", "bf16"],
        default="no",
        help=mixed_precision_help,
    )
    subparser.set_defaults(func=default_config_command)
    return subparser


def default_config_command(args):
    """Entry point of the `default` subcommand: write the basic config and report where it was saved."""
    saved_path = write_basic_config(args.mixed_precision, args.save_location)
    if not saved_path:
        # Nothing was written (a falsy result is returned when a config already exists).
        return
    print(f"accelerate configuration saved at {saved_path}")


================================================
FILE: src/accelerate/commands/config/sagemaker.py
================================================
#!/usr/bin/env python

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os

from ...utils.constants import SAGEMAKER_PARALLEL_EC2_INSTANCES, TORCH_DYNAMO_MODES
from ...utils.dataclasses import ComputeEnvironment, SageMakerDistributedType
from ...utils.imports import is_boto3_available
from .config_args import SageMakerConfig
from .config_utils import (
    DYNAMO_BACKENDS,
    _ask_field,
    _ask_options,
    _convert_dynamo_backend,
    _convert_mixed_precision,
    _convert_sagemaker_distributed_mode,
    _convert_yes_no_to_bool,
)


if is_boto3_available():
    import boto3  # noqa: F401


def _create_iam_role_for_sagemaker(role_name):
    """
    Create an IAM role named `role_name` that SageMaker can assume and attach an inline permission policy to it.

    If IAM reports that the entity already exists, a message is printed and the existing role is used as is.
    """
    # Trust policy: lets the SageMaker service assume the role.
    trust_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {"Effect": "Allow", "Principal": {"Service": "sagemaker.amazonaws.com"}, "Action": "sts:AssumeRole"}
        ],
    }
    # Actions granted to the role by the inline policy.
    allowed_actions = [
        "sagemaker:*",
        "ecr:GetDownloadUrlForLayer",
        "ecr:BatchGetImage",
        "ecr:BatchCheckLayerAvailability",
        "ecr:GetAuthorizationToken",
        "cloudwatch:PutMetricData",
        "cloudwatch:GetMetricData",
        "cloudwatch:GetMetricStatistics",
        "cloudwatch:ListMetrics",
        "logs:CreateLogGroup",
        "logs:CreateLogStream",
        "logs:DescribeLogStreams",
        "logs:PutLogEvents",
        "logs:GetLogEvents",
        "s3:CreateBucket",
        "s3:ListBucket",
        "s3:GetBucketLocation",
        "s3:GetObject",
        "s3:PutObject",
    ]
    permission_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Action": allowed_actions,
                "Resource": "*",
            }
        ],
    }

    iam_client = boto3.client("iam")
    try:
        # create the role, associated with the chosen trust policy
        iam_client.create_role(RoleName=role_name, AssumeRolePolicyDocument=json.dumps(trust_policy, indent=2))
        # attach policy to role
        iam_client.put_role_policy(
            RoleName=role_name,
            PolicyName=f"{role_name}_policy_permission",
            PolicyDocument=json.dumps(permission_policy, indent=2),
        )
    except iam_client.exceptions.EntityAlreadyExistsException:
        print(f"role {role_name} already exists. Using existing one")


def _get_iam_role_arn(role_name):
    """Look up the IAM role called `role_name` and return its ARN."""
    role_description = boto3.client("iam").get_role(RoleName=role_name)
    return role_description["Role"]["Arn"]


def get_sagemaker_input():
    """
    Interactively ask for everything needed to launch on Amazon SageMaker and build the matching config.

    The questions cover, in order: AWS authorization, region, IAM role, custom Docker image, input channels, metrics,
    distributed mode, torch dynamo options, EC2 instance type, debug mode, number of machines and mixed precision.

    Side effects:
        - Exports the answers as `AWS_PROFILE` or `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY`, and
          `AWS_DEFAULT_REGION`, in `os.environ`.
        - When the user asks for it, creates the `accelerate_sagemaker_execution_role` IAM role.

    Returns:
        `SageMakerConfig`: The configuration built from the answers.
    """

    def _ask_yes_no(question):
        # Every yes/no question defaults to "no" when the user just presses enter.
        return _ask_field(
            question,
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )

    credentials_configuration = _ask_options(
        "How do you want to authorize?",
        ["AWS Profile", "Credentials (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) "],
        int,
    )
    aws_profile = None
    if credentials_configuration == 0:
        aws_profile = _ask_field("Enter your AWS Profile name: [default] ", default="default")
        os.environ["AWS_PROFILE"] = aws_profile
    else:
        print(
            "Note you will need to provide AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY when you launch you training script with,"
            "`accelerate launch --aws_access_key_id XXX --aws_secret_access_key YYY`"
        )
        aws_access_key_id = _ask_field("AWS Access Key ID: ")
        os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id

        aws_secret_access_key = _ask_field("AWS Secret Access Key: ")
        os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key

    aws_region = _ask_field("Enter your AWS Region: [us-east-1]", default="us-east-1")
    os.environ["AWS_DEFAULT_REGION"] = aws_region

    role_management = _ask_options(
        "Do you already have an IAM Role for executing Amazon SageMaker Training Jobs?",
        ["Provide IAM Role name", "Create new IAM role using credentials"],
        int,
    )
    if role_management == 0:
        iam_role_name = _ask_field("Enter your IAM role name: ")
    else:
        iam_role_name = "accelerate_sagemaker_execution_role"
        print(f'Accelerate will create an iam role "{iam_role_name}" using the provided credentials')
        _create_iam_role_for_sagemaker(iam_role_name)

    docker_image = None
    if _ask_yes_no("Do you want to use custom Docker image? [yes/NO]: "):
        # NOTE(review): the image reference is lower-cased as before; confirm this is wanted for case-sensitive tags.
        docker_image = _ask_field("Enter your Docker image: ", lambda x: str(x).lower())

    sagemaker_inputs_file = None
    if _ask_yes_no("Do you want to provide SageMaker input channels with data locations? [yes/NO]: "):
        # File paths are kept exactly as typed: lower-casing them breaks paths on case-sensitive filesystems.
        sagemaker_inputs_file = _ask_field(
            "Enter the path to the SageMaker inputs TSV file with columns (channel_name, data_location): ",
            str,
        )

    sagemaker_metrics_file = None
    if _ask_yes_no("Do you want to enable SageMaker metrics? [yes/NO]: "):
        # Same as above: do not alter the case of a user-supplied path.
        sagemaker_metrics_file = _ask_field(
            "Enter the path to the SageMaker metrics TSV file with columns (metric_name, metric_regex): ",
            str,
        )

    distributed_type = _ask_options(
        "What is the distributed mode?",
        ["No distributed training", "Data parallelism"],
        _convert_sagemaker_distributed_mode,
    )

    dynamo_config = {}
    use_dynamo = _ask_yes_no("Do you wish to optimize your script with torch dynamo?[yes/NO]:")
    if use_dynamo:
        prefix = "dynamo_"
        dynamo_config[prefix + "backend"] = _ask_options(
            "Which dynamo backend would you like to use?",
            [x.lower() for x in DYNAMO_BACKENDS],
            _convert_dynamo_backend,
            default=2,
        )
        use_custom_options = _ask_yes_no("Do you want to customize the defaults sent to torch.compile? [yes/NO]: ")

        if use_custom_options:
            dynamo_config[prefix + "mode"] = _ask_options(
                "Which mode do you want to use?",
                TORCH_DYNAMO_MODES,
                lambda x: TORCH_DYNAMO_MODES[int(x)],
                # The menu expects the *index* of the pre-selected choice, not its label.
                default=TORCH_DYNAMO_MODES.index("default"),
            )
            dynamo_config[prefix + "use_fullgraph"] = _ask_yes_no(
                "Do you want the fullgraph mode or it is ok to break model into several subgraphs? [yes/NO]: "
            )
            dynamo_config[prefix + "use_dynamic"] = _ask_yes_no(
                "Do you want to enable dynamic shape tracing? [yes/NO]: "
            )
            dynamo_config[prefix + "use_regional_compilation"] = _ask_yes_no(
                "Do you want to enable regional compilation? [yes/NO]: "
            )

    ec2_instance_query = "Which EC2 instance type you want to use for your training?"
    if distributed_type != SageMakerDistributedType.NO:
        # Distributed runs are restricted to the instance types known to support them.
        ec2_instance_type = _ask_options(
            ec2_instance_query, SAGEMAKER_PARALLEL_EC2_INSTANCES, lambda x: SAGEMAKER_PARALLEL_EC2_INSTANCES[int(x)]
        )
    else:
        # The base question already ends with "?", so only the default hint is appended.
        ec2_instance_query += " [ml.p3.2xlarge]:"
        ec2_instance_type = _ask_field(ec2_instance_query, lambda x: str(x).lower(), default="ml.p3.2xlarge")

    debug = False
    if distributed_type != SageMakerDistributedType.NO:
        debug = _ask_yes_no(
            "Should distributed operations be checked while running for errors? This can avoid timeout issues but will be slower. [yes/NO]: "
        )

    num_machines = 1
    if distributed_type in (SageMakerDistributedType.DATA_PARALLEL, SageMakerDistributedType.MODEL_PARALLEL):
        num_machines = _ask_field(
            "How many machines do you want use? [1]: ",
            int,
            default=1,
        )

    mixed_precision = _ask_options(
        "Do you wish to use FP16 or BF16 (mixed precision)?",
        ["no", "fp16", "bf16", "fp8"],
        _convert_mixed_precision,
    )

    if use_dynamo and mixed_precision == "no":
        print(
            "Torch dynamo used without mixed precision requires TF32 to be efficient. Accelerate will enable it by default when launching your scripts."
        )

    return SageMakerConfig(
        image_uri=docker_image,
        compute_environment=ComputeEnvironment.AMAZON_SAGEMAKER,
        distributed_type=distributed_type,
        use_cpu=False,
        dynamo_config=dynamo_config,
        ec2_instance_type=ec2_instance_type,
        profile=aws_profile,
        region=aws_region,
        iam_role_name=iam_role_name,
        mixed_precision=mixed_precision,
        num_machines=num_machines,
        sagemaker_inputs_file=sagemaker_inputs_file,
        sagemaker_metrics_file=sagemaker_metrics_file,
        debug=debug,
    )


================================================
FILE: src/accelerate/commands/config/update.py
================================================
#!/usr/bin/env python

# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

from .config_args import default_config_file, load_config_from_file
from .config_utils import SubcommandHelpFormatter


# Help text of the `update` subcommand (passed as `help=` when the subparser is registered below).
description = "Update an existing config file with the latest defaults while maintaining the old configuration."


def update_config(args):
    """
    Update an existing config file with the latest defaults while maintaining the old configuration.

    The file is loaded with `load_config_from_file` and written back in place, as JSON when its name ends with
    `.json` and as YAML otherwise.

    Args:
        args:
            Parsed arguments; `args.config_file` is the file to update, or `None` to use `default_config_file`.

    Returns:
        The path of the config file that was rewritten.

    Raises:
        ValueError: If the selected config file does not exist.
    """
    config_file = args.config_file
    if config_file is None:
        # No explicit path: fall back to the default location, and fail clearly when nothing is there
        # (previously this case reached `Path(None)` and crashed with a `TypeError`).
        if not Path(default_config_file).exists():
            raise ValueError(
                f"No config file was passed and the default config file located at {default_config_file} doesn't exist."
            )
        config_file = default_config_file
    elif not Path(config_file).exists():
        raise ValueError(f"The passed config file located at {config_file} doesn't exist.")
    config = load_config_from_file(config_file)

    # `str(...)` keeps the suffix check working for path-like objects as well as plain strings.
    if str(config_file).endswith(".json"):
        config.to_json_file(config_file)
    else:
        config.to_yaml_file(config_file)
    return config_file


def update_command_parser(parser, parents):
    """
    Register the `update` subcommand on `parser` and return the created subparser.

    The subcommand accepts an optional `--config_file` and dispatches to `update_config_command`.
    """
    config_file_help = (
        "The path to the config file to update. Will default to a file named default_config.yaml in the cache "
        "location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have "
        "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
        "with 'huggingface'."
    )

    subparser = parser.add_parser(
        "update", parents=parents, help=description, formatter_class=SubcommandHelpFormatter
    )
    subparser.add_argument("--config_file", default=None, help=config_file_help)
    subparser.set_defaults(func=update_config_command)
    return subparser


def update_config_command(args):
    """Entry point of the `update` subcommand: rewrite the config file and report its location."""
    updated_path = update_config(args)
    print(f"Successfully updated the configuration file at {updated_path}.")


================================================
FILE: src/accelerate/commands/env.py
================================================
#!/usr/bin/env python

# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import platform
import subprocess

import numpy as np
import psutil
import torch

from accelerate import __version__ as version
from accelerate.commands.config import default_config_file, load_config_from_file

from ..utils import (
    is_mlu_available,
    is_musa_available,
    is_neuron_available,
    is_npu_available,
    is_sdaa_available,
    is_xpu_available,
)


def env_command_parser(subparsers=None):
    """
    Build the argument parser of the `env` command.

    Args:
        subparsers (*optional*):
            Sub-parser collection to register `env` on. When `None`, a standalone parser is created instead.

    Returns:
        The parser; when registered as a subcommand its `func` default is set to `env_command`.
    """
    standalone = subparsers is None
    if standalone:
        parser = argparse.ArgumentParser("Accelerate env command")
    else:
        parser = subparsers.add_parser("env")

    parser.add_argument(
        "--config_file",
        default=None,
        help="The config file to use for the default values in the launching script.",
    )

    if not standalone:
        parser.set_defaults(func=env_command)
    return parser


def env_command(args):
    """
    Collect and print information about the current environment, formatted for pasting into a GitHub issue.

    The report contains library / platform versions, the detected PyTorch accelerator, system RAM, the location of
    the `accelerate` executable and the Accelerate config (the one passed through `args.config_file`, or the default
    one when it exists).

    Args:
        args:
            Parsed arguments; only `args.config_file` is read.

    Returns:
        `dict`: The collected information, including the config under the "`Accelerate` configs" key.
    """
    pt_version = torch.__version__
    pt_cuda_available = torch.cuda.is_available()
    pt_xpu_available = is_xpu_available()
    pt_mlu_available = is_mlu_available()
    pt_sdaa_available = is_sdaa_available()
    pt_musa_available = is_musa_available()
    pt_npu_available = is_npu_available()
    pt_neuron_available = is_neuron_available()

    accelerator = "N/A"
    if pt_cuda_available:
        accelerator = "CUDA"
    elif pt_xpu_available:
        accelerator = "XPU"
    elif pt_mlu_available:
        accelerator = "MLU"
    elif pt_sdaa_available:
        accelerator = "SDAA"
    elif pt_musa_available:
        accelerator = "MUSA"
    elif pt_npu_available:
        accelerator = "NPU"
    elif pt_neuron_available:
        accelerator = "NEURON"

    accelerate_config = "Not found"
    # Get the default from the config file.
    if args.config_file is not None or os.path.isfile(default_config_file):
        accelerate_config = load_config_from_file(args.config_file).to_dict()

    # if we can run which, get it
    command = None
    bash_location = "Not found"
    if os.name == "nt":
        command = ["where", "accelerate"]
    elif os.name == "posix":
        command = ["which", "accelerate"]
    if command is not None:
        try:
            bash_location = subprocess.check_output(command, text=True, stderr=subprocess.STDOUT).strip()
        except (subprocess.CalledProcessError, OSError):
            # The lookup tool is missing or `accelerate` is not on PATH (e.g. when run through `python -m`):
            # keep the "Not found" placeholder rather than aborting the whole report.
            pass
    info = {
        "`Accelerate` version": version,
        "Platform": platform.platform(),
        "`accelerate` bash location": bash_location,
        "Python version": platform.python_version(),
        "Numpy version": np.__version__,
        "PyTorch version": f"{pt_version}",
        "PyTorch accelerator": accelerator,
        "System RAM": f"{psutil.virtual_memory().total / 1024**3:.2f} GB",
    }
    # Add a device-specific entry for the detected accelerator.
    if pt_cuda_available:
        info["GPU type"] = torch.cuda.get_device_name()
    elif pt_xpu_available:
        info["XPU type"] = torch.xpu.get_device_name()
    elif pt_mlu_available:
        info["MLU type"] = torch.mlu.get_device_name()
    elif pt_sdaa_available:
        info["SDAA type"] = torch.sdaa.get_device_name()
    elif pt_musa_available:
        info["MUSA type"] = torch.musa.get_device_name()
    elif pt_neuron_available:
        info["NEURON type"] = torch.neuron.get_device_name()
    elif pt_npu_available:
        info["CANN version"] = torch.version.cann

    print("\nCopy-and-paste the text below in your GitHub issue\n")
    print("\n".join([f"- {prop}: {val}" for prop, val in info.items()]))

    print("- `Accelerate` default config:" if args.config_file is None else "- `Accelerate` config passed:")
    # The config is a dict when one was loaded, and the "Not found" placeholder string otherwise.
    accelerate_config_str = (
        "\n".join([f"\t- {prop}: {val}" for prop, val in accelerate_config.items()])
        if isinstance(accelerate_config, dict)
        else f"\t{accelerate_config}"
    )
    print(accelerate_config_str)

    info["`Accelerate` configs"] = accelerate_config

    return info


def main() -> int:
    """Parse the command line, print the environment report and return exit code 0."""
    cli_args = env_command_parser().parse_args()
    env_command(cli_args)
    return 0


# When executed as a script, run `main()` and use its return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())


================================================
FILE: src/accelerate/commands/estimate.py
================================================
#!/usr/bin/env python

# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional

import torch
from huggingface_hub import model_info
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError

from accelerate import init_empty_weights
from accelerate.commands.utils import CustomArgumentParser
from accelerate.utils import (
    calculate_maximum_sizes,
    convert_bytes,
    is_timm_available,
    is_transformers_available,
)


if is_transformers_available():
    import transformers
    from transformers import AutoConfig, AutoModel

if is_timm_available():
    import timm


def verify_on_hub(repo: str, token: Optional[str] = None):
    """
    Verifies that the model is on the hub and returns the model info.

    Args:
        repo (`str`):
            The repository id to look up on the Hub.
        token (`str`, `optional`, defaults to `None`):
            The access token forwarded to `model_info`.

    Returns:
        The object returned by `model_info` on success, otherwise one of the sentinel strings `"gated"` (the repo
        is gated, or another `OSError` occurred) or `"repo"` (the repo could not be found).
    """
    try:
        return model_info(repo, token=token)
    # The handlers must go from the most specific exception to the most generic one: `GatedRepoError` derives
    # from `RepositoryNotFoundError`, and the Hub HTTP errors are themselves `OSError`s, so catching `OSError`
    # first would make the `"repo"` answer unreachable.
    except GatedRepoError:
        return "gated"
    except RepositoryNotFoundError:
        return "repo"
    except OSError:
        # Kept from the original behaviour: any other OS-level failure is reported as "gated"
        return "gated"


def check_has_model(error):
    """
    Checks what library spawned `error` when a model is not found

    Args:
        error (`Exception`):
            The exception raised while trying to create the model.

    Returns:
        `str`: `"timm"` or `"transformers"` when the error message matches the "model not found" message of that
        library (and the library is installed), `"unknown"` otherwise.
    """
    # `error.args` can be empty, and its first item is not always a string (e.g. an errno or a wrapped
    # exception). Normalise it so this helper never raises and hides the error being inspected.
    message = str(error.args[0]) if getattr(error, "args", None) else ""
    if is_timm_available() and isinstance(error, RuntimeError) and "Unknown model" in message:
        return "timm"
    elif (
        is_transformers_available()
        and isinstance(error, OSError)
        and "does not appear to have a file named" in message
    ):
        return "transformers"
    else:
        return "unknown"


def create_empty_model(
    model_name: str, library_name: str, trust_remote_code: bool = False, access_token: Optional[str] = None
):
    """
    Builds a weightless, full precision version of a Hub model through the library it belongs to, so that its
    memory footprint can be computed.

    Args:
        model_name (`str`):
            The model name on the Hub
        library_name (`str`):
            The library the model has an integration with, such as `transformers`. When `None`, the library is read
            from the metadata of `model_name` on the Hub.
        trust_remote_code (`bool`, `optional`, defaults to `False`):
            Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
            should only be set to `True` for repositories you trust and in which you have read the code, as it will
            execute code present on the Hub on your local machine.
        access_token (`str`, `optional`, defaults to `None`):
            The access token to use to access private or gated models on the Hub. (for use on the Gradio app)

    Returns:
        `torch.nn.Module`: The torch model that has been initialized on the `meta` device.

    Raises:
        `OSError`: if the repo is reported as gated or missing by `verify_on_hub`.
        `ValueError`: if no library can be determined, or the library is not supported.
        `ImportError`: if the required library is not installed.
        `RuntimeError`: if a `transformers` model has no config metadata on the Hub.
    """
    hub_info = verify_on_hub(model_name, access_token)
    # Simplified errors: `verify_on_hub` answers with a sentinel string when the lookup failed
    if hub_info == "gated":
        raise OSError(
            f"Repo for model `{model_name}` is gated. You must be authenticated to access it. Please run `huggingface-cli login`."
        )
    if hub_info == "repo":
        raise OSError(
            f"Repo for model `{model_name}` does not exist on the Hub. If you are trying to access a private repo,"
            " make sure you are authenticated via `huggingface-cli login` and have access."
        )

    if library_name is None:
        # Fall back on the library advertised in the Hub metadata
        library_name = getattr(hub_info, "library_name", False)
        if not library_name:
            raise ValueError(
                f"Model `{model_name}` does not have any library metadata on the Hub, please manually pass in a `--library_name` to use (such as `transformers`)"
            )

    if library_name == "timm":
        if not is_timm_available():
            raise ImportError(
                f"To check `{model_name}`, `timm` must be installed. Please install it via `pip install timm`"
            )
        print(f"Loading pretrained config for `{model_name}` from `timm`...")
        with init_empty_weights():
            return timm.create_model(model_name, pretrained=False)

    if library_name != "transformers":
        raise ValueError(
            f"Library `{library_name}` is not supported yet, please open an issue on GitHub for us to add support."
        )

    if not is_transformers_available():
        raise ImportError(
            f"To check `{model_name}`, `transformers` must be installed. Please install it via `pip install transformers`"
        )
    print(f"Loading pretrained config for `{model_name}` from `transformers`...")
    hub_config = hub_info.config
    if hub_config is None:
        raise RuntimeError(f"Tried to load `{model_name}` with `transformers` but it does not have any metadata.")

    auto_map = hub_config.get("auto_map", False)
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code, token=access_token)
    with init_empty_weights():
        # remote code could specify a specific `AutoModel` class in the `auto_map`
        constructor = AutoModel
        if isinstance(auto_map, dict):
            auto_class_name = next((name for name in auto_map if name.startswith("AutoModelFor")), None)
            if auto_class_name is not None:
                constructor = getattr(transformers, auto_class_name)
        # we need to pass the dtype, otherwise it is going to use the torch_dtype that is saved in the config
        return constructor.from_config(config, torch_dtype=torch.float32, trust_remote_code=trust_remote_code)


def create_ascii_table(headers: list, rows: list, title: str):
    """
    Creates a pretty table from a list of rows, minimal version of `tabulate`.

    Args:
        headers (`list` of `str`):
            The column names, one per column.
        rows (`list` of `list` of `str`):
            The table content; every row holds one string per header. May be empty, in which case only the title
            and the header line are drawn.
        title (`str`):
            The text centered in the banner spanning the top of the table.

    Returns:
        `str`: The table drawn with box characters, lines separated by `"\\n"` (no trailing newline).
    """
    sep_char, in_between = "│", "─"
    # Each column is as wide as its longest cell, header included. Sizing from `headers` (rather than from the
    # first row) keeps the function usable when `rows` is empty.
    column_widths = [
        max(len(cell) for cell in [header, *(row[col] for row in rows)]) for col, header in enumerate(headers)
    ]

    def make_rule(left_char, middle_char, right_char):
        "Draws a horizontal rule using the current column widths."
        return f"{left_char}{middle_char.join(in_between * width for width in column_widths)}{right_char}"

    def make_line(cells):
        "Draws one line of content with every cell centered in its column."
        centered = (cell.center(column_widths[col]) for col, cell in enumerate(cells))
        return f"{sep_char}{sep_char.join(centered)}{sep_char}"

    if len(title) > sum(column_widths):
        # The title does not fit: stretch the last column. The amount (twice the gap between the title and the
        # un-stretched rule) reproduces the historical layout of this table.
        gap = abs(len(title) - len(make_rule("├", "┼", "┤")))
        column_widths[-1] += 2 * gap

    separator = make_rule("├", "┼", "┤")
    lines = [
        make_rule("┌", in_between, "┐"),
        f"{sep_char}{title.center(len(separator) - 2)}{sep_char}",
        make_rule("├", "┬", "┤"),
        make_line(headers),
        separator,
        *(make_line(row) for row in rows),
        make_rule("└", "┴", "┘"),
    ]
    return "\n".join(lines)


def estimate_command_parser(subparsers=None):
    """
    Builds the argument parser of the `estimate-memory` command.

    When `subparsers` is given the parser is registered on it under the name `estimate-memory` and
    `estimate_command` is stored as its `func` default; otherwise a standalone `CustomArgumentParser` is created.
    """
    standalone = subparsers is None
    if standalone:
        parser = CustomArgumentParser(
            description="Model size estimator for fitting a model onto device(e.g. cuda, xpu) memory."
        )
    else:
        parser = subparsers.add_parser("estimate-memory")

    supported_dtypes = ["float32", "float16", "int8", "int4"]

    parser.add_argument("model_name", type=str, help="The model name on the Hugging Face Hub.")
    parser.add_argument(
        "--library_name",
        type=str,
        choices=["timm", "transformers"],
        help="The library the model has an integration with, such as `transformers`, needed only if this information is not stored on the Hub.",
    )
    parser.add_argument(
        "--dtypes",
        type=str,
        nargs="+",
        choices=supported_dtypes,
        # by default every supported dtype is reported (a copy, so the default and the choices stay independent)
        default=list(supported_dtypes),
        help="The dtypes to use for the model, must be one (or many) of `float32`, `float16`, `int8`, and `int4`",
    )
    parser.add_argument(
        "--trust_remote_code",
        action="store_true",
        default=False,
        help="""Whether or not to allow for custom models defined on the Hub in their own modeling files. This flag
                should only be used for repositories you trust and in which you have read the code, as it will execute
                code present on the Hub on your local machine.""",
    )

    if not standalone:
        parser.set_defaults(func=estimate_command)
    return parser


def estimate_training_usage(bytes: int, mixed_precision: str, msamp_config: Optional[str] = None) -> dict:
    """
    Estimates the memory needed to train a model of `bytes` bytes with a batch size of 1 under `mixed_precision`.

    Args:
        bytes (`int`):
            The size of the model being trained.
        mixed_precision (`str`):
            The mixed precision that would be ran.
        msamp_config (`str`):
            The msamp config to estimate the training memory for if `mixed_precision` is set to `"fp8"`.

    Returns:
        `dict`: The sizes under the keys `"model"`, `"optimizer"`, `"gradients"` and `"step"`. Every value is `-1`
        when no estimate exists for the requested configuration.
    """
    full_precision = bytes
    half_precision = bytes // 2

    if mixed_precision == "float32":
        return {
            "model": full_precision,
            "optimizer": full_precision * 2,
            "gradients": full_precision,
            "step": full_precision * 4,
        }

    # With native `TransformersEngine`, there is no memory savings with FP8
    native_fp8 = mixed_precision == "fp8" and msamp_config is None
    if native_fp8 or mixed_precision in ("float16", "bfloat16"):
        # 2x from optimizer states
        optimizer_states = full_precision * 2
        return {
            # With mixed precision training, the model has weights stored in FP16 and FP32
            "model": full_precision,
            "optimizer": optimizer_states,
            # 1.5 from weight gradient + computation (GEMM)
            "gradients": full_precision + half_precision,
            "step": optimizer_states,
        }

    # No estimate available for this configuration
    return dict.fromkeys(("model", "optimizer", "gradients", "step"), -1)


def gather_data(args):
    """
    Creates an empty model and gathers the data for the sizes

    Returns one `[dtype, largest layer size, total size, training sizes]` entry per dtype in `args.dtypes`. The
    training sizes are always estimated from the full precision total size.
    """
    try:
        model = create_empty_model(
            args.model_name, library_name=args.library_name, trust_remote_code=args.trust_remote_code
        )
    except (RuntimeError, OSError) as e:
        library = check_has_model(e)
        if library == "unknown":
            raise e
        raise RuntimeError(
            f"Tried to load `{args.model_name}` with `{library}` but a possible model to load was not found inside the repo."
        )

    total_size, largest_layer = calculate_maximum_sizes(model)

    # How many times smaller than full precision each reduced dtype is
    shrink_factors = {"float16": 2, "int8": 4, "int4": 8}

    data = []
    for dtype in args.dtypes:
        layer_size = largest_layer[0]
        training_size = estimate_training_usage(total_size, dtype)
        factor = shrink_factors.get(dtype)
        if factor is None:
            # full precision (or unlisted dtype): sizes are reported untouched
            data.append([dtype, layer_size, total_size, training_size])
        else:
            data.append([dtype, layer_size / factor, total_size / factor, training_size])
    return data


def estimate_command(args):
    "Gathers the size data for `args.model_name`, makes every cell printable and prints the resulting table."
    data = gather_data(args)
    for row in data:
        for position, cell in enumerate(row):
            if isinstance(cell, dict):
                # Training estimate: report its largest entry, `-1` meaning that no estimate is available
                peak = max(cell.values())
                row[position] = "N/A" if peak == -1 else convert_bytes(peak)
            elif isinstance(cell, (int, float)):
                row[position] = convert_bytes(cell)

    print(
        create_ascii_table(
            ["dtype", "Largest Layer", "Total Size", "Training using Adam"],
            data,
            f"Memory Usage for loading `{args.model_name}`",
        )
    )


def main():
    "Parses the command line with the `estimate-memory` parser and runs `estimate_command` on the result."
    estimate_command(estimate_command_parser().parse_args())


# Script entry point: run the estimator when this module is executed directly.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/commands/launch.py
================================================
#!/usr/bin/env python

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import importlib
import logging
import os
import subprocess
import sys
from pathlib import Path

import torch

from accelerate.commands.config import default_config_file, load_config_from_file
from accelerate.commands.config.config_args import SageMakerConfig
from accelerate.commands.config.config_utils import DYNAMO_BACKENDS
from accelerate.commands.utils import CustomArgumentParser
from accelerate.state import get_int_from_env
from accelerate.utils import (
    ComputeEnvironment,
    DistributedType,
    PrepareForLaunch,
    _filter_args,
    check_cuda_p2p_ib_support,
    convert_dict_to_env_variables,
    is_bf16_available,
    is_deepspeed_available,
    is_hpu_available,
    is_mlu_available,
    is_musa_available,
    is_neuron_available,
    is_npu_available,
    is_rich_available,
    is_sagemaker_available,
    is_sdaa_available,
    is_torch_xla_available,
    is_xpu_available,
    patch_environment,
    prepare_deepspeed_cmd_env,
    prepare_multi_gpu_env,
    prepare_sagemager_args_inputs,
    prepare_simple_launcher_cmd_env,
    prepare_tpu,
    str_to_bool,
)
from accelerate.utils.constants import DEEPSPEED_MULTINODE_LAUNCHERS, TORCH_DYNAMO_MODES


# When `rich` is installed, configure the root logger to emit its records through a `RichHandler`.
if is_rich_available():
    from rich import get_console
    from rich.logging import RichHandler

    # Only the message itself is put in the format string
    FORMAT = "%(message)s"
    logging.basicConfig(format=FORMAT, datefmt="[%X]", handlers=[RichHandler()])


# Module level logger
logger = logging.getLogger(__name__)


# Maps a command line option, in the form returned by `clean_option`, to the title of an argument group.
# `CustomHelpFormatter.add_argument` uses it to decide which extra groups are displayed in the help.
options_to_group = {
    "multi_gpu": "Distributed GPUs",
    "tpu": "TPU",
    "use_deepspeed": "DeepSpeed Arguments",
    "use_fsdp": "FSDP Arguments",
    "use_megatron_lm": "Megatron-LM Arguments",
    "fp8_backend": "FP8 Arguments",
}


def clean_option(option):
    """
    Normalizes a command line token into a bare option name.

    Any token containing `fp8_backend` is treated as `--fp8_backend`. A token starting with `--` is returned
    without that prefix and with every `-` changed to `_`; any other token yields `None`.
    """
    token = "--fp8_backend" if "fp8_backend" in option else option
    if not token.startswith("--"):
        return None
    return token[2:].replace("-", "_")


class CustomHelpFormatter(argparse.HelpFormatter):
    """
    This is a custom help formatter that will hide all arguments that are not used in the command line when the help is
    called. This is useful for the case where the user is using a specific platform and only wants to see the arguments
    for that platform.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Titles of the argument groups that are always candidates for display
        self.titles = [
            "Hardware Selection Arguments",
            "Resource Selection Arguments",
            "Training Paradigm Arguments",
            "positional arguments",
            "optional arguments",
        ]

    def add_argument(self, action: argparse.Action):
        """
        Registers `action` in the help, after hiding it when it is not relevant to the options found in `sys.argv`
        and dropping the hyphenated spellings of its option strings.
        """
        # Skip the `launch` subcommand when called as `accelerate launch ...`
        if "accelerate" in sys.argv[0] and "launch" in sys.argv[1:]:
            args = sys.argv[2:]
        else:
            args = sys.argv[1:]

        # Filtering only applies when something besides the help flag itself was passed
        if len(args) > 1:
            args = list(map(clean_option, args))
            used_platforms = [arg for arg in args if arg in options_to_group]
            used_titles = [options_to_group[o] for o in used_platforms]
            title = action.container.title
            if title not in self.titles + used_titles:
                action.help = argparse.SUPPRESS
            elif title in ("Hardware Selection Arguments", "Training Paradigm Arguments"):
                # `args` went through `clean_option` (no leading `--`, underscores only), so the option strings
                # of the action must be normalized the same way, otherwise the two sets can never intersect.
                option_names = {clean_option(option) for option in action.option_strings} - {None}
                if option_names.isdisjoint(args):
                    action.help = argparse.SUPPRESS
                else:
                    marker = " (currently selected)"
                    # The action object outlives this formatter: never tag the same help twice
                    if (
                        isinstance(action.help, str)
                        and action.help != argparse.SUPPRESS
                        and not action.help.endswith(marker)
                    ):
                        action.help = action.help + marker

        # Only advertise the underscore spelling of each option
        action.option_strings = [s for s in action.option_strings if "-" not in s[2:]]
        super().add_argument(action)

    def end_section(self):
        "Closes the current section, blanking it first (items and heading) when it holds fewer than two items."
        if len(self._current_section.items) < 2:
            self._current_section.items = []
            self._current_section.heading = ""
        super().end_section()


def launch_command_parser(subparsers=None):
    description = "Launch a python script in a distributed scenario. Arguments can be passed in with either hyphens (`--num-processes=2`) or underscores (`--num_processes=2`)"
    if subparsers is not None:
        parser = subparsers.add_parser(
            "launch", description=description, add_help=False, allow_abbrev=False, formatter_class=CustomHelpFormatter
        )
    else:
        parser = CustomArgumentParser(
            "Accelerate launch command",
            description=description,
            add_help=False,
            allow_abbrev=False,
            formatter_class=CustomHelpFormatter,
        )

    parser.add_argument("-h", "--help", action="help", help="Show this help message and exit.")

    parser.add_argument(
        "--config_file",
        default=None,
        help="The config file to use for the default values in the launching script.",
    )
    parser.add_argument(
        "--quiet",
        "-q",
        action="store_true",
        help="Silence subprocess errors from the launch stack trace and only show the relevant tracebacks. (Only applicable to DeepSpeed and single-process configurations)",
    )
    # Hardware selection arguments
    hardware_args = parser.add_argument_group(
        "Hardware Selection Arguments", "Arguments for selecting the hardware to be used."
    )
    hardware_args.add_argument(
        "--cpu", default=False, action="store_true", help="Whether or not to force the training on the CPU."
    )
    hardware_args.add_argument(
        "--multi_gpu",
        default=False,
        action="store_true",
        help="Whether or not this should launch a distributed GPU training.",
    )
    hardware_args.add_argument(
        "--tpu", default=False, action="store_true", help="Whether or not this should launch a TPU training."
    )
    # Resource selection arguments
    resource_args = parser.add_argument_group(
        "Resource Selection Arguments", "Arguments for fine-tuning how available hardware should be used."
    )
    resource_args.add_argument(
        "--mixed_precision",
        type=str,
        choices=["no", "fp16", "bf16", "fp8"],
        help="Whether or not to use mixed precision training. "
        "Choose between FP16 and BF16 (bfloat16) training. "
        "BF16 training is only supported on Nvidia Ampere GPUs and PyTorch 1.10 or later.",
    )
    resource_args.add_argument(
        "--num_processes", type=int, default=None, help="The total number of processes to be launched in parallel."
    )
    resource_args.add_argument(
        "--num_machines", type=int, default=None, help="The total number of machines used in this training."
    )
    resource_args.add_argument(
        "--num_cpu_threads_per_process",
        type=int,
        default=None,
        help="The number of CPU threads per process. Can be tuned for optimal performance.",
    )
    resource_args.add_argument(
        "--enable_cpu_affinity",
        default=False,
        action="store_true",
        help="Whether or not CPU affinity and balancing should be enabled. Currently only supported on NVIDIA hardware.",
    )
    # Dynamo arguments
    resource_args.add_argument(
        "--dynamo_backend",
        type=str,
        choices=["no"] + [b.lower() for b in DYNAMO_BACKENDS],
        help="Choose a backend to optimize your training with dynamo, see more at "
        "https://github.com/pytorch/torchdynamo.",
    )
    resource_args.add_argument(
        "--dynamo_mode",
        type=str,
        default="default",
        choices=TORCH_DYNAMO_MODES,
        help="Choose a mode to optimize your training with dynamo.",
    )
    resource_args.add_argument(
        "--dynamo_use_fullgraph",
        default=False,
        action="store_true",
        help="Whether to use full graph mode for dynamo or it is ok to break model into several subgraphs",
    )
    resource_args.add_argument(
        "--dynamo_use_dynamic",
        default=False,
        action="store_true",
        help="Whether to enable dynamic shape tracing.",
    )
    resource_args.add_argument(
        "--dynamo_use_regional_compilation",
        default=False,
        action="store_true",
        help="Whether to enable regional compilation.",
    )

    # Training Paradigm arguments
    paradigm_args = parser.add_argument_group(
        "Training Paradigm Arguments", "Arguments for selecting which training paradigm to be used."
    )
    paradigm_args.add_argument(
        "--use_deepspeed",
        default=False,
        action="store_true",
        help="Whether to use deepspeed.",
    )
    paradigm_args.add_argument(
        "--use_fsdp",
        default=False,
        action="store_true",
        help="Whether to use fsdp.",
    )
    paradigm_args.add_argument(
        "--use_parallelism_config",
        default=False,
        action="store_true",
        help="Whether to use the parallelism config to configure the N-d distributed training.",
    )
    paradigm_args.add_argument(
        "--use_megatron_lm",
        default=False,
        action="store_true",
        help="Whether to use Megatron-LM.",
    )

    # distributed GPU training arguments
    distributed_args = parser.add_argument_group("Distributed GPUs", "Arguments related to distributed GPU training.")
    distributed_args.add_argument(
        "--gpu_ids",
        default=None,
        help="What GPUs (by id) should be used for training on this machine as a comma-separated list",
    )
    distributed_args.add_argument(
        "--same_network",
        default=False,
        action="store_true",
        help="Whether all machines used for multinode training exist on the same local network.",
    )
    distributed_args.add_argument(
        "--machine_rank", type=int, default=None, help="The rank of the machine on which this script is launched."
    )
    distributed_args.add_argument(
        "--main_process_ip", type=str, default=None, help="The IP address of the machine of rank 0."
    )
    distributed_args.add_argument(
        "--main_process_port",
        type=int,
        default=None,
        help="The port to use to communicate with the machine of rank 0.",
    )
    distributed_args.add_argument(
        "-t",
        "--tee",
        default="0",
        type=str,
        help="Tee std streams into a log file and also to console.",
    )
    distributed_args.add_argument(
        "--log_dir",
        type=str,
        default=None,
        help=(
            "Base directory to use for log files when using torchrun/torch.distributed.run as launcher. "
            "Use with --tee to redirect std streams info log files."
        ),
    )
    distributed_args.add_argument(
        "--role",
        type=str,
        default="default",
        help="User-defined role for the workers.",
    )
    # Rendezvous related arguments
    distributed_args.add_argument(
        "--rdzv_backend",
        type=str,
        default="static",
        help="The rendezvous method to use, such as 'static' (the default) or 'c10d'",
    )
    distributed_args.add_argument(
        "--rdzv_conf",
        type=str,
        default="",
        help="Additional rendezvous configuration (<key1>=<value1>,<key2>=<value2>,...).",
    )
    distributed_args.add_argument(
        "--max_restarts",
        type=int,
        default=0,
        help="Maximum number of worker group restarts before failing.",
    )
    distributed_args.add_argument(
        "--monitor_interval",
        type=float,
        default=0.1,
        help="Interval, in seconds, to monitor the state of workers.",
    )
    parser.add_argument(
        "-m",
        "--module",
        action="store_true",
        help="Change each process to interpret the launch script as a Python module, executing with the same behavior as 'python -m'.",
    )
    parser.add_argument(
        "--no_python",
        action="store_true",
        help="Skip prepending the training script with 'python' - just execute it directly. Useful when the script is not a Python script.",
    )

    # TPU arguments
    tpu_args = parser.add_argument_group("TPU", "Arguments related to TPU.")
    tpu_args.add_argument(
        "--tpu_cluster",
        action="store_true",
        dest="tpu_use_cluster",
        help="Whether to use a GCP TPU pod for training.",
    )
    tpu_args.add_argument(
        "--no_tpu_cluster",
        action="store_false",
        dest="tpu_use_cluster",
        help="Should not be passed explicitly, this is for internal use only.",
    )
    tpu_args.add_argument(
        "--tpu_use_sudo",
        action="store_true",
        help="Whether to use `sudo` when running the TPU training script in each pod.",
    )
    tpu_args.add_argument(
        "--vm",
        type=str,
        action="append",
        help=(
            "List of single Compute VM instance names. "
            "If not provided we assume usage of instance groups. For TPU pods."
        ),
    )
    tpu_args.add_argument(
        "--env",
        type=str,
        action="append",
        help="List of environment variables to set on the Compute VM instances. For TPU pods.",
    )
    tpu_args.add_argument(
        "--main_training_function",
        type=str,
        default=None,
        help="The name of the main function to be executed in your script (only for TPU training).",
    )
    tpu_args.add_argument(
        "--downcast_bf16",
        action="store_true",
        help="Whether when using bf16 precision on TPUs if both float and double tensors are cast to bfloat16 or if double tensors remain as float32.",
    )

    # DeepSpeed arguments
    deepspeed_args = parser.add_argument_group("DeepSpeed Arguments", "Arguments related to DeepSpeed.")
    deepspeed_args.add_argument(
        "--deepspeed_config_file",
        default=None,
        type=str,
        help="DeepSpeed config file.",
    )
    deepspeed_args.add_argument(
        "--zero_stage",
        default=None,
        type=int,
        help="DeepSpeed's ZeRO optimization stage (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to `2`.",
    )
    deepspeed_args.add_argument(
        "--offload_optimizer_device",
        default=None,
        type=str,
        help="Decides where (none|cpu|nvme) to offload optimizer states (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to 'none'.",
    )
    deepspeed_args.add_argument(
        "--offload_param_device",
        default=None,
        type=str,
        help="Decides where (none|cpu|nvme) to offload parameters (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to 'none'.",
    )
    deepspeed_args.add_argument(
        "--offload_optimizer_nvme_path",
        default=None,
        type=str,
        help="Decides Nvme Path to offload optimizer states (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to 'none'.",
    )
    deepspeed_args.add_argument(
        "--offload_param_nvme_path",
        default=None,
        type=str,
        help="Decides Nvme Path to offload parameters (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to 'none'.",
    )
    deepspeed_args.add_argument(
        "--gradient_accumulation_steps",
        default=None,
        type=int,
        help="No of gradient_accumulation_steps used in your training script (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to `1`.",
    )
    deepspeed_args.add_argument(
        "--gradient_clipping",
        default=None,
        type=float,
        help="gradient clipping value used in your training script (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to `1.0`.",
    )
    deepspeed_args.add_argument(
        "--zero3_init_flag",
        default=None,
        type=str,
        help="Decides Whether (true|false) to enable `deepspeed.zero.Init` for constructing massive models. "
        "Only applicable with DeepSpeed ZeRO Stage-3. If unspecified, will default to `true`.",
    )
    deepspeed_args.add_argument(
        "--zero3_save_16bit_model",
        default=None,
        type=str,
        help="Decides Whether (true|false) to save 16-bit model weights when using ZeRO Stage-3. "
        "Only applicable with DeepSpeed ZeRO Stage-3. If unspecified, will default to `false`.",
    )
    deepspeed_args.add_argument(
        "--deepspeed_hostfile",
        default=None,
        type=str,
        help="DeepSpeed hostfile for configuring multi-node compute resources.",
    )
    deepspeed_args.add_argument(
        "--deepspeed_exclusion_filter",
        default=None,
        type=str,
        help="DeepSpeed exclusion filter string when using multi-node setup.",
    )
    deepspeed_args.add_argument(
        "--deepspeed_inclusion_filter",
        default=None,
        type=str,
        help="DeepSpeed inclusion filter string when using multi-node setup.",
    )
    deepspeed_args.add_argument(
        "--deepspeed_multinode_launcher",
        default=None,
        type=str,
        help="DeepSpeed multi-node launcher to use, e.g. `pdsh`, `standard`, `openmpi`, `mvapich`, `mpich`, `slurm`, `nossh` (requires DeepSpeed >= 0.14.5). If unspecified, will default to `pdsh`.",
    )
    deepspeed_args.add_argument(
        "--deepspeed_moe_layer_cls_names",
        default=None,
        type=str,
        help="comma-separated list of transformer MoE layer class names (case-sensitive) to wrap ,e.g, `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ..."
        " (useful only when `use_deepspeed` flag is passed).",
    )

    # fsdp arguments
    fsdp_args = parser.add_argument_group("FSDP Arguments", "Arguments related to Fully Shared Data Parallelism.")
    fsdp_args.add_argument(
        "--fsdp_version",
        type=str,
        default="1",
        choices=["1", "2"],
        help="FSDP version to use. (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_offload_params",
        default="false",
        type=str,
        help="Decides Whether (true|false) to offload parameters and gradients to CPU. (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_min_num_params",
        type=int,
        default=int(1e8),
        help="FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when `use_fsdp` flag is passed).",
    )
    # We enable this for backwards compatibility, throw a warning if this is set in `FullyShardedDataParallelPlugin`
    fsdp_args.add_argument(
        "--fsdp_sharding_strategy",
        type=str,
        default="FULL_SHARD",
        help="FSDP's sharding strategy. (useful only when `use_fsdp` flag is passed and `fsdp_version=1`).",
    )
    fsdp_args.add_argument(
        "--fsdp_reshard_after_forward",
        type=str,
        default="true",
        help="FSDP's Reshard After Forward Strategy. (useful only when `use_fsdp` flag is passed). Supports either boolean (FSDP2) or `FULL_SHARD | SHARD_GRAD_OP | NO_RESHARD` (FSDP1).",
    )
    fsdp_args.add_argument(
        "--fsdp_auto_wrap_policy",
        type=str,
        default=None,
        help="FSDP's auto wrap policy. (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_transformer_layer_cls_to_wrap",
        default=None,
        type=str,
        help="Transformer layer class name (case-sensitive) to wrap ,e.g, `BertLayer`, `GPTJBlock`, `T5Block` .... "
        "(useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_backward_prefetch",
        default=None,
        type=str,
        help="FSDP's backward prefetch policy. (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_state_dict_type",
        default=None,
        type=str,
        help="FSDP's state dict type. (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_forward_prefetch",
        default="false",
        type=str,
        help="If True, then FSDP explicitly prefetches the next upcoming "
        "all-gather while executing in the forward pass (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_use_orig_params",
        default="true",
        type=str,
        help="If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable parameters."
        " (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_cpu_ram_efficient_loading",
        default="true",
        type=str,
        help="If True, only the first process loads the pretrained model checkoint while all other processes have empty weights. "
        "Only applicable for 🤗 Transformers. When using this, `--fsdp_sync_module_states` needs to True. "
        "(useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_sync_module_states",
        default="true",
        type=str,
        help="If True, each individually wrapped FSDP unit will broadcast module parameters from rank 0."
        " (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_activation_checkpointing",
        default="false",
        type=str,
        help="Decides Whether (true|false) intermediate activations are freed during the forward pass, and a checkpoint is left as a placeholder. (useful only when `use_fsdp` flag is passed).",
    )

    # megatron_lm args
    megatron_lm_args = parser.add_argument_group("Megatron-LM Arguments", "Arguments related to Megatron-LM.")
    megatron_lm_args.add_argument(
        "--megatron_lm_tp_degree",
        type=int,
        default=1,
        help="Megatron-LM's Tensor Parallelism (TP) degree. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_use_custom_fsdp",
        type=bool,
        default=False,
        help="Whether to use custom FSDP. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_no_load_optim",
        type=bool,
        default=False,
        help="Whether to not load optimizer. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_eod_mask_loss",
        type=bool,
        default=False,
        help="Whether to use eod mask loss. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_overlap_cpu_optimizer_d2h_h2d",
        type=bool,
        default=False,
        help="Whether to overlap CPU optimizer step, gradients D2H and updated parameters H2D. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_no_save_optim",
        type=bool,
        default=False,
        help="Whether to not save optimizer. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_optimizer_cpu_offload",
        type=bool,
        default=False,
        help="Whether to use CPU offload for optimizer. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_use_precision_aware_optimizer",
        type=bool,
        default=False,
        help="Whether to use precision aware optimizer. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_decoder_last_pipeline_num_layers",
        type=int,
        default=None,
        help="Megatron-LM's decoder last pipeline number of layers, default None is even split of transformer layers across all pipeline stages.",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_pp_degree",
        type=int,
        default=1,
        help="Megatron-LM's Pipeline Parallelism (PP) degree. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_num_micro_batches",
        type=int,
        default=None,
        help="Megatron-LM's number of micro batches when PP degree > 1. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_sequence_parallelism",
        default=None,
        type=str,
        help="Decides Whether (true|false) to enable Sequence Parallelism when TP degree > 1. "
        "(useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_recompute_activations",
        default=None,
        type=str,
        help="Decides Whether (true|false) to enable Selective Activation Recomputation. "
        "(useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_use_distributed_optimizer",
        default=None,
        type=str,
        help="Decides Whether (true|false) to use distributed optimizer "
        "which shards optimizer state and gradients across Data Pralellel (DP) ranks. "
        "(useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_gradient_clipping",
        default=1.0,
        type=float,
        help="Megatron-LM's gradient clipping value based on global L2 Norm (0 to disable). "
        "(useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_recompute_granularity",
        default=None,
        type=str,
        help="Megatron-LM's recompute granularity (full, selective). "
        "(useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_recompute_method",
        default=None,
        type=str,
        help="Megatron-LM's recompute method (uniform, block). (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_recompute_num_layers",
        default=None,
        type=int,
        help="Megatron-LM's number of layers to recompute. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_attention_backend",
        default=None,
        type=str,
        help="Decides Whether (true|false) to enable attention backend. "
        "(useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_expert_model_parallel_size",
        default=None,
        type=int,
        help="Megatron-LM's expert model parallel size. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_context_parallel_size",
        default=None,
        type=int,
        help="Megatron-LM's context parallel size. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_attention_dropout",
        default=None,
        type=float,
        help="Megatron-LM's attention dropout rate. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_hidden_dropout",
        default=None,
        type=float,
        help="Megatron-LM's hidden dropout rate. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_attention_softmax_in_fp32",
        default=None,
        type=str,
        help="Decides Whether (true|false) to use fp32 for attention softmax. "
        "(useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_expert_tensor_parallel_size",
        default=None,
        type=int,
        help="Megatron-LM's expert tensor parallel size. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_calculate_per_token_loss",
        default=None,
        type=str,
        help="Decides Whether (true|false) to calculate per token loss. "
        "(useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_use_rotary_position_embeddings",
        default=None,
        type=str,
        help="Decides Whether (true|false) to use rotary position embeddings. "
        "(useful only when `use_megatron_lm` flag is passed).",
    )

    # FP8 arguments
    fp8_args = parser.add_argument_group(
        "FP8 Arguments", "Arguments related to FP8 training (requires `--mixed_precision=fp8`)"
    )
    fp8_args.add_argument(
        "--fp8_backend",
        type=str,
        choices=["ao", "te", "msamp"],
        help="Choose a backend to train with FP8 (ao: torchao, te: TransformerEngine, msamp: MS-AMP)",
    )
    fp8_args.add_argument(
        "--fp8_use_autocast_during_eval",
        default=False,
        action="store_true",
        help="Whether to use FP8 autocast during eval mode (useful only when `--fp8_backend=te` is passed). Generally better metrics are found when this is not passed.",
    )
    fp8_args.add_argument(
        "--fp8_margin",
        type=int,
        default=0,
        help="The margin to use for the gradient scaling (useful only when `--fp8_backend=te` is passed).",
    )
    fp8_args.add_argument(
        "--fp8_interval",
        type=int,
        default=1,
        help="The interval to use for how often the scaling factor is recomputed (useful only when `--fp8_backend=te` is passed).",
    )
    fp8_args.add_argument(
        "--fp8_format",
        type=str,
        default="HYBRID",
        choices=["HYBRID", "E4M3", "E5M2"],
        help="The format to use for the FP8 recipe (useful only when `--fp8_backend=te` is passed).",
    )
    fp8_args.add_argument(
        "--fp8_amax_history_len",
        type=int,
        default=1024,
        help="The length of the history to use for the scaling factor computation (useful only when `--fp8_backend=te` is passed).",
    )
    fp8_args.add_argument(
        "--fp8_amax_compute_algo",
        type=str,
        default="most_recent",
        choices=["max", "most_recent"],
        help="The algorithm to use for the scaling factor computation. (useful only when `--fp8_backend=te` is passed).",
    )
    fp8_args.add_argument(
        "--fp8_override_linear_precision",
        type=lambda x: tuple(map(str_to_bool, x.split(","))),
        default=(False, False, False),
        help="Whether or not to execute `fprop`, `dgrad`, and `wgrad` GEMMS in higher precision. Should be passed in a comma-separated string of booleans (useful only when `--fp8_backend=te` is passed).",
    )
    fp8_args.add_argument(
        "--fp8_opt_level",
        type=str,
        default="O2",
        choices=["O1", "O2"],
        help="What level of 8-bit collective communication should be used with MS-AMP (useful only when `--fp8_backend=msamp` is passed).",
    )
    fp8_args.add_argument(
        "--fp8_enable_fsdp_float8_all_gather",
        default="true",
        type=str_to_bool,
        help="Whether to enable FSDP2 float8 all gather (useful only when `--fp8_backend=ao` is passed).",
    )
    fp8_args.add_argument(
        "--fp8_pad_inner_dim",
        default="true",
        type=str_to_bool,
        help="Whether to pad the inner dimension for FP8 GEMMs (useful only when `--fp8_backend=ao` is passed).",
    )

    # AWS arguments
    aws_args = parser.add_argument_group("AWS Arguments", "Arguments related to AWS.")
    aws_args.add_argument(
        "--aws_access_key_id",
        type=str,
        default=None,
        help="The AWS_ACCESS_KEY_ID used to launch the Amazon SageMaker training job",
    )
    aws_args.add_argument(
        "--aws_secret_access_key",
        type=str,
        default=None,
        help="The AWS_SECRET_ACCESS_KEY used to launch the Amazon SageMaker training job.",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Whether to print out the torch.distributed stack trace when something fails.",
    )
    parser.add_argument(
        "training_script",
        type=str,
        help=(
            "The full path to the script to be launched in parallel, followed by all the arguments for the training "
            "script."
        ),
    )

    # MPI arguments
    mpirun_args = parser.add_argument_group("MPI Arguments", "Arguments related to mpirun for Multi-CPU")
    mpirun_args.add_argument(
        "--mpirun_hostfile",
        type=str,
        default=None,
        help="Location for a hostfile for using Accelerate to launch a multi-CPU training job with mpirun. This will "
        "get passed to the MPI --hostfile or -f parameter, depending on which MPI program is installed.",
    )

    # ParallelismConfig arguments
    parallelism_config_args = parser.add_argument_group(
        "ParallelismConfig Arguments",
        "Arguments related to the ParallelismConfig used for distributed training.",
    )

    parallelism_config_args.add_argument(
        "--parallelism_config_dp_replicate_size",
        type=int,
        default=1,
        help="The number of processes for data parallel training. Defaults to 1 (no data parallelism).",
    )

    parallelism_config_args.add_argument(
        "--parallelism_config_dp_shard_size",
        type=int,
        default=1,
        help="The number of processes for FSDP sharding. Defaults to 1 (No FSDP sharding).",
    )

    parallelism_config_args.add_argument(
        "--parallelism_config_tp_size",
        type=int,
        default=1,
        help="The number of processes for tensor parallel training. Defaults to 1 (no tensor parallelism).",
    )

    parallelism_config_args.add_argument(
        "--parallelism_config_cp_size",
        type=int,
        default=1,
        help="The number of processese for context parallel training. Defaults to 1 (no context parallelism).",
    )

    parallelism_config_args.add_argument(
        "--parallelism_config_cp_backend",
        type=str,
        choices=["torch"],
        default="torch",
        help="Context Parallelism backend: torch (FSDP2) or deepspeed (ALST/Ulysses)",
    )

    parallelism_config_args.add_argument(
        "--parallelism_config_cp_comm_strategy",
        type=str,
        default="allgather",
        help="The communication strategy for context parallel training. Defaults to 'allgather'. Other option is alltoall",
    )

    parallelism_config_args.add_argument(
        "--parallelism_config_sp_size",
        type=int,
        default=1,
        help="The number of processese for context parallel training. Defaults to 1 (no context parallelism).",
    )

    parallelism_config_args.add_argument(
        "--parallelism_config_sp_backend",
        type=str,
        choices=["deepspeed"],
        default="deepspeed",
        help="Sequence Parallelism backend: deepspeed (ALST/Ulysses)",
    )

    parallelism_config_args.add_argument(
        "--parallelism_config_sp_seq_length",
        type=str,
        default=None,
        help="Sequence length for when batches are all of the same length. For variable sequence lengths across batches set `parallelism_config_sp_seq_length_is_variable=True`",
    )

    parallelism_config_args.add_argument(
        "--parallelism_config_sp_seq_length_is_variable",
        type=bool,
        default=True,
        help="If `True` will work with a sequence length that may change between batches, in which case `parallelism_config_sp_seq_length` value can be set to anything divisible by sp size or remain unset. If `False` then `parallelism_config_sp_seq_length` needs to match the batch's sequence length dimension. The default is `True`.",
    )

    parallelism_config_args.add_argument(
        "--parallelism_config_sp_attn_implementation",
        type=str,
        default="sdpa",
        help="Attention implementation to use. Can be one of 'flash_attention_2', 'flash_attention_3', 'sdpa', or a hub-hosted kernel (e.g. 'kernels-community/flash-attn2'). Defaults to `sdpa`.",
    )

    # Other arguments of the training scripts
    parser.add_argument("training_script_args", nargs=argparse.REMAINDER, help="Arguments of the training script.")

    if subparsers is not None:
        parser.set_defaults(func=launch_command)
    return parser


def simple_launcher(args):
    """
    Run the training command as a single child process and wait for it to finish.

    The command and its environment are built by `prepare_simple_launcher_cmd_env(args)`.

    Raises:
        `subprocess.CalledProcessError`: If the child exits with a non-zero return code and `args.quiet` is falsy.
        When `args.quiet` is truthy the interpreter exits with status `1` instead.
    """
    cmd, current_env = prepare_simple_launcher_cmd_env(args)

    process = subprocess.Popen(cmd, env=current_env)
    process.wait()
    exit_code = process.returncode
    if exit_code == 0:
        return

    # Quiet mode: exit with a plain failure status instead of surfacing a traceback.
    if args.quiet:
        sys.exit(1)
    raise subprocess.CalledProcessError(returncode=exit_code, cmd=cmd)


def multi_gpu_launcher(args):
    """
    Launch the training script in-process through `torch.distributed.run`.

    The environment comes from `prepare_multi_gpu_env(args)`. When `check_cuda_p2p_ib_support()` reports no support,
    `NCCL_P2P_DISABLE` and `NCCL_IB_DISABLE` are set to `"1"` unless they are already present in that environment.
    With `--debug` and `rich` available, a failure is printed through the rich console instead of being re-raised.
    """
    import torch.distributed.run as distrib_run

    current_env = prepare_multi_gpu_env(args)
    if not check_cuda_p2p_ib_support():
        # Only fill in the variables that are not already set, and warn once if anything was changed.
        changed_env = False
        for nccl_flag in ("NCCL_P2P_DISABLE", "NCCL_IB_DISABLE"):
            if nccl_flag not in current_env:
                current_env[nccl_flag] = "1"
                changed_env = True
        if changed_env:
            logger.warning(
                "Using RTX 4000 series which doesn't support faster communication speedups. Ensuring P2P and IB communications are disabled."
            )

    # `debug` has to be read before `args` is rebound to the result of `_filter_args`.
    debug = getattr(args, "debug", False)
    script_cli = ["--training_script", args.training_script, "--training_script_args", args.training_script_args]
    args = _filter_args(args, distrib_run.get_args_parser(), script_cli)

    with patch_environment(**current_env):
        try:
            distrib_run.run(args)
        except Exception:
            if not (is_rich_available() and debug):
                raise
            console = get_console()
            console.print("\n[bold red]Using --debug, `torch.distributed` Stack Trace:[/bold red]")
            console.print_exception(suppress=[__file__], show_locals=False)


def deepspeed_launcher(args):
    """
    Launch a DeepSpeed training job.

    Two paths exist:

    - More than one machine, with a multinode launcher other than the entry at index 1 of
      `DEEPSPEED_MULTINODE_LAUNCHERS`: the environment is appended to the `DEEPSPEED_ENVIRONMENT_NAME` file and the
      command from `prepare_deepspeed_cmd_env(args)` is run as a child process.
    - Otherwise: the script is launched in-process through `torch.distributed.run`.

    Raises:
        `ImportError`: If DeepSpeed is not installed.
        `subprocess.CalledProcessError`: If the child process fails and `args.quiet` is falsy (with `args.quiet` the
        interpreter exits with status `1` instead).
    """
    import torch.distributed.run as distrib_run

    if not is_deepspeed_available():
        raise ImportError("DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source.")
    from deepspeed.launcher.runner import DEEPSPEED_ENVIRONMENT_NAME

    cmd, current_env = prepare_deepspeed_cmd_env(args)
    if not check_cuda_p2p_ib_support():
        # Only fill in the variables that are not already set, and warn once if anything was changed.
        changed_env = False
        for nccl_flag in ("NCCL_P2P_DISABLE", "NCCL_IB_DISABLE"):
            if nccl_flag not in current_env:
                current_env[nccl_flag] = "1"
                changed_env = True
        if changed_env:
            logger.warning(
                "Using RTX 4000 series which doesn't support faster communication speedups. Ensuring P2P and IB communications are disabled."
            )

    launch_via_subprocess = (
        args.num_machines > 1 and args.deepspeed_multinode_launcher != DEEPSPEED_MULTINODE_LAUNCHERS[1]
    )

    if not launch_via_subprocess:
        # `debug` has to be read before `args` is rebound to the result of `_filter_args`.
        debug = getattr(args, "debug", False)
        script_cli = ["--training_script", args.training_script, "--training_script_args", args.training_script_args]
        args = _filter_args(args, distrib_run.get_args_parser(), script_cli)
        with patch_environment(**current_env):
            try:
                distrib_run.run(args)
            except Exception:
                if not (is_rich_available() and debug):
                    raise
                console = get_console()
                console.print("\n[bold red]Using --debug, `torch.distributed` Stack Trace:[/bold red]")
                console.print_exception(suppress=[__file__], show_locals=False)
        return

    # Append the environment to the DeepSpeed environment file before starting the launcher command.
    with open(DEEPSPEED_ENVIRONMENT_NAME, "a") as env_file:
        valid_env_items = convert_dict_to_env_variables(current_env)
        if len(valid_env_items) > 1:
            env_file.writelines(valid_env_items)

    process = subprocess.Popen(cmd, env=current_env)
    process.wait()
    exit_code = process.returncode
    if exit_code == 0:
        return
    if args.quiet:
        sys.exit(1)
    raise subprocess.CalledProcessError(returncode=exit_code, cmd=cmd)


def tpu_launcher(args):
    """
    Import the training script as a module and spawn its main function with `torch_xla`'s `xmp.spawn`.

    With `--module`, `args.training_script` is used directly as the module name. Otherwise the script's parent
    directory is appended to `sys.path` and the file stem is imported. `sys.argv` is replaced by the module file
    followed by the training script arguments before spawning.

    Raises:
        `ValueError`: If `--no_python` is set, or the imported module has no attribute named
        `args.main_training_function`.
    """
    import torch_xla.distributed.xla_multiprocessing as xmp

    if args.no_python:
        raise ValueError("--no_python cannot be used with TPU launcher")

    args, current_env = prepare_tpu(args, {})

    mod_name = args.training_script
    if not args.module:
        # Import training_script as a module: make its directory importable and use the bare file name.
        script_path = Path(args.training_script)
        sys.path.append(str(script_path.parent.resolve()))
        mod_name = script_path.stem

    mod = importlib.import_module(mod_name)
    main_function = getattr(mod, args.main_training_function, None)
    if not hasattr(mod, args.main_training_function):
        raise ValueError(
            f"Your training script should have a function named {args.main_training_function}, or you should pass a "
            "different value to `--main_training_function`."
        )

    # Patch sys.argv so the training script sees its own arguments
    sys.argv = [mod.__file__] + args.training_script_args

    with patch_environment(**current_env):
        xmp.spawn(PrepareForLaunch(main_function), args=())


def tpu_pod_launcher(args):
    """
    Launch training on a TPU pod through `torch_xla.distributed.xla_dist`.

    An `accelerate-launch` command (optionally prefixed with `sudo`) is assembled and stored as the `positional`
    argument of the `xla_dist` namespace; the environment from `prepare_tpu` is forwarded through its `env` list
    together with `ACCELERATE_IN_TPU_POD=1`.

    Raises:
        `ValueError`: If any `docker_*` attribute of the `xla_dist` namespace holds a value other than `""`/`None`.
    """
    from torch_xla.distributed import xla_dist

    args, current_env = prepare_tpu(args, {}, True)
    debug = getattr(args, "debug", False)

    training_script = args.training_script
    training_script_args = args.training_script_args
    xla_cli = ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"]
    new_args = _filter_args(args, xla_dist.get_args_parser(), xla_cli)

    sudo_prefix = ["sudo"] if args.tpu_use_sudo else []
    launch_cmd = [
        "accelerate-launch",
        "--tpu",
        "--no_tpu_cluster",
        "--num_machines",
        "1",
        "--mixed_precision",
        "no",
        "--dynamo_backend",
        "no",
        "--num_processes",
        str(args.num_processes),
        "--main_training_function",
        str(args.main_training_function),
        training_script,
    ]
    new_args.positional = sudo_prefix + launch_cmd + training_script_args

    # Collect every docker-related option that was given a value; none of them is supported here.
    bad_flags = "".join(
        f'{name}="{value}"\n'
        for name, value in vars(new_args).items()
        if name.startswith("docker_") and value != "" and value is not None
    )
    if bad_flags != "":
        raise ValueError(
            f"Docker containers are not supported for TPU pod launcher currently, please remove the following flags:\n{bad_flags}"
        )

    new_args.env = [f"{k}={v}" for k, v in current_env.items()] + ["ACCELERATE_IN_TPU_POD=1"]
    try:
        xla_dist.resolve_and_execute(new_args)
    except Exception:
        if not (is_rich_available() and debug):
            raise
        console = get_console()
        console.print("\n[bold red]Using --debug, `torch_xla.xla_dist` Stack Trace:[/bold red]")
        console.print_exception(suppress=[__file__], show_locals=False)


def sagemaker_launcher(sagemaker_config: SageMakerConfig, args):
    """
    Start a training job on Amazon SageMaker with the `HuggingFace` estimator.

    The estimator keyword arguments and the `fit` inputs are produced by `prepare_sagemager_args_inputs`. Once `fit`
    returns, the estimator's `model_data` location is printed.

    Raises:
        `ImportError`: If the `sagemaker` package is not available.
        `ValueError`: If `--module` or `--no_python` is set.
    """
    if not is_sagemaker_available():
        raise ImportError(
            "Please install sagemaker to be able to launch training on Amazon SageMaker with `pip install accelerate[sagemaker]`"
        )
    if args.module or args.no_python:
        raise ValueError(
            "SageMaker requires a python training script file and cannot be used with --module or --no_python"
        )

    # Imported lazily, only after the availability check above has passed.
    from sagemaker.huggingface import HuggingFace

    estimator_kwargs, sagemaker_inputs = prepare_sagemager_args_inputs(sagemaker_config, args)

    huggingface_estimator = HuggingFace(**estimator_kwargs)
    huggingface_estimator.fit(inputs=sagemaker_inputs)

    print(f"You can find your model data at: {huggingface_estimator.model_data}")


def _validate_launch_command(args):
    """
    Sanity-check the parsed `accelerate launch` arguments and fill in the values that were left unset.

    When a config file is used (an explicit `--config_file`, or the default config file when it exists and `--cpu`
    was not passed), missing values are taken from it. Otherwise they are derived from the visible devices or set to
    fixed fallbacks, and every value chosen that way is reported in a single warning.

    Args:
        args:
            The parsed arguments namespace. It is modified in place.

    Returns:
        `tuple`: `(args, defaults, mp_from_config_flag)`. `defaults` is the loaded config, or `None` when no config
        file was used. `mp_from_config_flag` is `True` only when `mixed_precision` was copied from the config file.

    Raises:
        `ValueError`: If more than one of `--cpu`, `--multi_gpu`, `--tpu`, `--use_deepspeed`, `--use_fsdp` is set; if
        `--multi_gpu` is combined with fewer than 2 processes; if `--use_parallelism_config` is used without FSDP; if
        fewer than two GPU ids are configured for a single-machine multi-GPU run; if `bf16` is requested but not
        available; or if the config file leaves `num_processes` at `-1`.
    """
    # Sanity checks
    if sum([args.multi_gpu, args.cpu, args.tpu, args.use_deepspeed, args.use_fsdp]) > 1:
        raise ValueError(
            "You can only use one of `--cpu`, `--multi_gpu`, `--tpu`, `--use_deepspeed`, `--use_fsdp` at a time."
        )
    if args.multi_gpu and (args.num_processes is not None) and (args.num_processes < 2):
        raise ValueError("You need to use at least 2 processes to use `--multi_gpu`.")

    # NOTE(review): the CLI parser declares `--fsdp_version` with `type=str`, so `== 1` only matches an integer value
    # set by other means; confirm whether the string "1" is meant to be rejected here as well.
    if (not args.use_fsdp or args.fsdp_version == 1) and args.use_parallelism_config:
        raise ValueError("You cannot use `--use_parallelism_config` without `--use_fsdp` and `--fsdp_version=2`. ")

    defaults = None
    warned = []
    mp_from_config_flag = False
    # Get the default from the config file.
    if args.config_file is not None or os.path.isfile(default_config_file) and not args.cpu:
        defaults = load_config_from_file(args.config_file)
        # The distributed mode is only taken from the config when none was requested on the command line.
        if (
            not args.multi_gpu
            and not args.tpu
            and not args.tpu_use_cluster
            and not args.use_deepspeed
            and not args.use_fsdp
            and not args.use_megatron_lm
        ):
            args.use_deepspeed = defaults.distributed_type == DistributedType.DEEPSPEED
            args.multi_gpu = defaults.distributed_type in (
                DistributedType.MULTI_GPU,
                DistributedType.MULTI_NPU,
                DistributedType.MULTI_MLU,
                DistributedType.MULTI_SDAA,
                DistributedType.MULTI_MUSA,
                DistributedType.MULTI_XPU,
                DistributedType.MULTI_HPU,
                DistributedType.MULTI_NEURON,
            )
            args.tpu = defaults.distributed_type == DistributedType.XLA
            args.use_fsdp = defaults.distributed_type == DistributedType.FSDP
            args.use_megatron_lm = defaults.distributed_type == DistributedType.MEGATRON_LM
            args.tpu_use_cluster = defaults.tpu_use_cluster if args.tpu else False
            args.use_parallelism_config = defaults.parallelism_config != {}
        if args.gpu_ids is None:
            if defaults.gpu_ids is not None:
                args.gpu_ids = defaults.gpu_ids
            else:
                args.gpu_ids = "all"

        if args.multi_gpu and args.num_machines is None:
            args.num_machines = defaults.num_machines

        if len(args.gpu_ids.split(",")) < 2 and (args.gpu_ids != "all") and args.multi_gpu and args.num_machines <= 1:
            raise ValueError(
                "Less than two GPU ids were configured and tried to run on on multiple GPUs. "
                "Please ensure at least two are specified for `--gpu_ids`, or use `--gpu_ids='all'`."
            )
        if defaults.compute_environment == ComputeEnvironment.LOCAL_MACHINE:
            # Update args with the defaults
            for name, attr in defaults.__dict__.items():
                if isinstance(attr, dict):
                    # Copy defaults.somedict.somearg to args.somearg and
                    # defaults.fsdp_config.x to args.fsdp_x
                    for key, value in attr.items():
                        if name == "fsdp_config" and not key.startswith("fsdp"):
                            key = "fsdp_" + key
                        elif name == "fp8_config" and not key.startswith("fp8"):
                            key = "fp8_" + key
                        # Keys listed in `args.nondefault` are left untouched; everything else is overwritten.
                        if hasattr(args, "nondefault") and key not in args.nondefault:
                            setattr(args, key, value)
                elif (
                    name not in ["compute_environment", "mixed_precision", "distributed_type"]
                    and getattr(args, name, None) is None
                ):
                    # Those args are handled separately
                    setattr(args, name, attr)
        if not args.debug:
            args.debug = defaults.debug

        if not args.mixed_precision:
            if defaults.mixed_precision is None:
                args.mixed_precision = "no"
            else:
                args.mixed_precision = defaults.mixed_precision
                mp_from_config_flag = True
        else:
            native_amp = is_bf16_available(True)
            if (
                args.mixed_precision == "bf16"
                and not native_amp
                and not (args.tpu and is_torch_xla_available(check_is_tpu=True))
            ):
                raise ValueError("bf16 mixed precision requires PyTorch >= 1.10 and a supported device.")

        # Silently set the default here
        if args.dynamo_backend is None:
            args.dynamo_backend = "no"
        if args.num_processes == -1:
            raise ValueError("You need to manually pass in `--num_processes` using this config yaml.")
    else:
        # No config file: derive the missing values and remember each one so it can be reported below.
        if args.num_processes is None:
            if is_xpu_available():
                args.num_processes = torch.xpu.device_count()
            elif is_mlu_available():
                args.num_processes = torch.mlu.device_count()
            elif is_sdaa_available():
                args.num_processes = torch.sdaa.device_count()
            elif is_musa_available():
                args.num_processes = torch.musa.device_count()
            elif is_npu_available():
                args.num_processes = torch.npu.device_count()
            elif is_hpu_available():
                args.num_processes = torch.hpu.device_count()
            elif is_neuron_available():
                args.num_processes = torch.neuron.device_count()
            else:
                args.num_processes = torch.cuda.device_count()
            warned.append(f"\t`--num_processes` was set to a value of `{args.num_processes}`")
        if args.debug is None:
            args.debug = False
        if (
            not args.multi_gpu
            and args.num_processes > 1
            and (
                (is_xpu_available() and torch.xpu.device_count() > 1)
                or (is_npu_available() and torch.npu.device_count() > 1)
                or (is_hpu_available() and torch.hpu.device_count() > 1)
                or (is_mlu_available() and torch.mlu.device_count() > 1)
                or (is_sdaa_available() and torch.sdaa.device_count() > 1)
                or (is_musa_available() and torch.musa.device_count() > 1)
                or (is_neuron_available() and torch.neuron.device_count() > 1)
                or (torch.cuda.is_available() and torch.cuda.device_count() > 1)
            )
        ):
            warned.append(
                "\t\tMore than one GPU was found, enabling multi-GPU training.\n"
                "\t\tIf this was unintended please pass in `--num_processes=1`."
            )
            args.multi_gpu = True
        if args.num_machines is None:
            warned.append("\t`--num_machines` was set to a value of `1`")
            args.num_machines = 1
        if args.mixed_precision is None:
            warned.append("\t`--mixed_precision` was set to a value of `'no'`")
            args.mixed_precision = "no"
        if not hasattr(args, "use_cpu"):
            args.use_cpu = args.cpu
        if args.dynamo_backend is None:
            warned.append("\t`--dynamo_backend` was set to a value of `'no'`")
            args.dynamo_backend = "no"
    if args.debug:
        logger.debug("Running script in debug mode, expect distributed operations to be slightly slower.")

    is_aws_env_disabled = defaults is None or defaults.compute_environment != ComputeEnvironment.AMAZON_SAGEMAKER
    if is_aws_env_disabled and args.num_cpu_threads_per_process is None:
        args.num_cpu_threads_per_process = get_int_from_env(["OMP_NUM_THREADS"], 1)
        # On CPU, when `OMP_NUM_THREADS` does not provide a non-zero value, split the physical cores between the
        # processes that run on this machine.
        if args.use_cpu and args.num_processes >= 1 and get_int_from_env(["OMP_NUM_THREADS"], 0) == 0:
            local_size = get_int_from_env(
                ["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"],
                max(int(args.num_processes / args.num_machines), 1),
            )
            import psutil

            # `psutil.cpu_count(logical=False)` returns `None` when the number of physical cores cannot be
            # determined. This is only a tuning step, so keep the value chosen above instead of failing with a
            # `TypeError` in that case.
            physical_cores = psutil.cpu_count(logical=False)
            threads_per_process = int(physical_cores / local_size) if physical_cores is not None else 0
            if threads_per_process > 1:
                args.num_cpu_threads_per_process = threads_per_process
                warned.append(
                    f"\t`--num_cpu_threads_per_process` was set to `{args.num_cpu_threads_per_process}` to improve out-of-box performance when training on CPUs"
                )

    if any(warned):
        message = "The following values were not passed to `accelerate launch` and had defaults used instead:\n"
        message += "\n".join(warned)
        message += (
            "\nTo avoid this warning pass in values for each of the problematic parameters or run `accelerate config`."
        )
        logger.warning(message)
    return args, defaults, mp_from_config_flag


def launch_command(args):
    """
    Validate the parsed `accelerate launch` arguments and dispatch to the matching launcher.

    The branches are tried in order: DeepSpeed, the multi-GPU launcher (FSDP, Megatron-LM or plain multi-GPU), TPU
    (pod or single host), SageMaker, and finally the simple launcher. Every accelerator branch is bypassed when
    `args.cpu` is set.

    Args:
        args: The namespace produced by the launch command parser.
    """
    args, defaults, mp_from_config_flag = _validate_launch_command(args)

    # DeepSpeed: record which fields came from the accelerate config before launching
    if args.use_deepspeed and not args.cpu:
        config_fields = list(defaults.deepspeed_config.keys()) if defaults else []
        if mp_from_config_flag:
            config_fields.append("mixed_precision")
        args.deepspeed_fields_from_accelerate_config = ",".join(config_fields)
        deepspeed_launcher(args)
        return

    # FSDP, Megatron-LM and plain multi-GPU all go through the same launcher
    if (args.use_fsdp or args.use_megatron_lm or args.multi_gpu) and not args.cpu:
        multi_gpu_launcher(args)
        return

    if args.tpu and not args.cpu:
        chosen_launcher = tpu_pod_launcher if args.tpu_use_cluster else tpu_launcher
        chosen_launcher(args)
        return

    if defaults is not None and defaults.compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER:
        sagemaker_launcher(defaults, args)
        return

    simple_launcher(args)


def main():
    """Parse the command line with the launch parser and run `launch_command` on the result."""
    launch_command(launch_command_parser().parse_args())


if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/commands/menu/__init__.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .selection_menu import BulletMenu


================================================
FILE: src/accelerate/commands/menu/cursor.py
================================================
# Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
A utility for showing and hiding the terminal cursor on Windows and Linux, based on https://github.com/bchao1/bullet
"""

import os
import sys
from contextlib import contextmanager


# Windows only: `ctypes` and the `CursorInfo` structure are only needed (and only defined) when `os.name == "nt"`.
# `hide_cursor` / `show_cursor` below reference them exclusively inside their own `os.name == "nt"` branches.
if os.name == "nt":
    import ctypes
    import msvcrt  # noqa

    class CursorInfo(ctypes.Structure):
        """Structure passed by reference to `GetConsoleCursorInfo` / `SetConsoleCursorInfo`."""

        # `_fields_` is the special attribute ctypes reads to lay out the structure members
        _fields_ = [("size", ctypes.c_int), ("visible", ctypes.c_byte)]


def hide_cursor():
    """
    Hide the terminal cursor.

    On POSIX an escape sequence is written to `sys.stdout` and flushed. On Windows the console cursor info is read,
    its `visible` field is cleared, and the structure is written back. Any other `os.name` is a no-op.
    """
    platform_name = os.name
    if platform_name == "posix":
        sys.stdout.write("\033[?25l")
        sys.stdout.flush()
        return
    if platform_name != "nt":
        return

    info = CursorInfo()
    kernel32 = ctypes.windll.kernel32
    # -11 is the handle id passed to `GetStdHandle` for the console output
    console = kernel32.GetStdHandle(-11)
    kernel32.GetConsoleCursorInfo(console, ctypes.byref(info))
    info.visible = False
    kernel32.SetConsoleCursorInfo(console, ctypes.byref(info))


def show_cursor():
    """
    Make the terminal cursor visible again.

    On POSIX an escape sequence is written to `sys.stdout` and flushed. On Windows the console cursor info is read,
    its `visible` field is set, and the structure is written back. Any other `os.name` is a no-op.
    """
    platform_name = os.name
    if platform_name == "posix":
        sys.stdout.write("\033[?25h")
        sys.stdout.flush()
        return
    if platform_name != "nt":
        return

    info = CursorInfo()
    kernel32 = ctypes.windll.kernel32
    # -11 is the handle id passed to `GetStdHandle` for the console output
    console = kernel32.GetStdHandle(-11)
    kernel32.GetConsoleCursorInfo(console, ctypes.byref(info))
    info.visible = True
    kernel32.SetConsoleCursorInfo(console, ctypes.byref(info))


@contextmanager
def hide():
    """
    Context manager to hide the terminal cursor.

    The cursor is hidden on entry and shown again on exit. Because `show_cursor()` runs in a `finally` clause, the
    cursor is restored even if the body of the `with` block (or `hide_cursor()` itself) raises.
    """
    try:
        hide_cursor()
        yield
    finally:
        show_cursor()


================================================
FILE: src/accelerate/commands/menu/helpers.py
================================================
# Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
A variety of helper functions and constants when dealing with terminal menu choices, based on
https://github.com/bchao1/bullet
"""

import enum
import shutil
import sys


# Number of columns of the terminal, captured once at import time (the row count is discarded).
# Used by `clear_line` and `linebreak` to fill a whole line.
TERMINAL_WIDTH, _ = shutil.get_terminal_size()

# Final letter of the `ESC [ <n> <letter>` sequence emitted by `move_cursor` for each direction name.
CURSOR_TO_CHAR = {"UP": "A", "DOWN": "B", "RIGHT": "C", "LEFT": "D"}


class Direction(enum.Enum):
    """Vertical movement directions; the member names match the keys of `CURSOR_TO_CHAR`."""

    UP = 0
    DOWN = 1


def forceWrite(content, end=""):
    """
    Write `content` (converted with `str`) followed by `end` to `sys.stdout` and flush immediately.

    Args:
        content: The object to print.
        end (`str`, *optional*, defaults to `""`): Text appended right after `content`.
    """
    text = str(content) + end
    stream = sys.stdout
    stream.write(text)
    stream.flush()


def writeColor(content, color, end=""):
    """
    Write `content` wrapped in an escape sequence that selects `color` and resets the style afterwards.

    Args:
        content: The object to print.
        color: The code inserted into the `ESC [ <color> m` sequence.
        end (`str`, *optional*, defaults to `""`): Text appended after the reset sequence.
    """
    colored = f"\u001b[{color}m{content}\u001b[0m"
    forceWrite(colored, end)


def reset_cursor():
    """Write a carriage return so the cursor goes back to the start of the current line."""
    forceWrite("\r")


def move_cursor(num_lines: int, direction: str):
    """
    Emit the escape sequence that moves the cursor `num_lines` steps in `direction`.

    Args:
        num_lines (`int`): How many steps to move.
        direction (`str`): One of the keys of `CURSOR_TO_CHAR`, case-insensitive.

    Raises:
        KeyError: If `direction.upper()` is not a key of `CURSOR_TO_CHAR`.
    """
    letter = CURSOR_TO_CHAR[direction.upper()]
    forceWrite(f"\033[{num_lines}{letter}")


def clear_line():
    """Overwrite the current line with `TERMINAL_WIDTH` spaces, then return the cursor to the line start."""
    blank = " " * TERMINAL_WIDTH
    forceWrite(blank)
    reset_cursor()


def linebreak():
    """Return the cursor to the line start and draw a rule of `TERMINAL_WIDTH` dashes."""
    reset_cursor()
    rule = "-" * TERMINAL_WIDTH
    forceWrite(rule)


================================================
FILE: src/accelerate/commands/menu/input.py
================================================
# Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file contains utilities for handling input from the user and registering specific keys to specific functions,
based on https://github.com/bchao1/bullet
"""

from .keymap import KEYMAP, get_character


def mark(key: str):
    """
    Build a decorator that tags a function with `key` so the `KeyHandler` metaclass can register it.

    The key is appended to the function's `handle_key` list (created on first use), and the same function object is
    returned.
    """

    def decorator(func):
        registered = getattr(func, "handle_key", [])
        registered.append(key)
        func.handle_key = registered
        return func

    return decorator


def mark_multiple(*keys: list[str]):
    """
    Build a decorator that tags a function with every key in `keys` so the `KeyHandler` metaclass can register it.

    All keys are appended, in order, to the function's `handle_key` list (created on first use), and the same
    function object is returned.
    """

    def decorator(func):
        registered = getattr(func, "handle_key", [])
        registered.extend(keys)
        func.handle_key = registered
        return func

    return decorator


class KeyHandler(type):
    """
    Metaclass that gathers the methods tagged with `handle_key` into a `key_handler` table on the class and installs
    `handle_input` on it.
    """

    def __new__(cls, name, bases, attrs):
        klass = super().__new__(cls, name, bases, attrs)
        # Only create the table when neither the class nor one of its bases already provides one
        if not hasattr(klass, "key_handler"):
            klass.key_handler = {}
        klass.handle_input = KeyHandler.handle_input

        # Register every attribute that was tagged by `mark` / `mark_multiple`
        for member in attrs.values():
            for key in getattr(member, "handle_key", []):
                klass.key_handler[key] = member
        return klass

    @staticmethod
    def handle_input(cls):
        "Reads one key and, when a handler is registered for it, records the key and returns the handler's result"
        pressed = get_character()
        # The `undefined` marker is already a key code; anything else is a character to convert
        code = pressed if pressed == KEYMAP["undefined"] else ord(pressed)
        callback = cls.key_handler.get(code)
        if not callback:
            return None
        cls.current_selection = code
        return callback(cls)


def register(cls):
    """Rebuild `cls` through the `KeyHandler` metaclass, reusing its name, bases and a copy of its namespace"""
    namespace = cls.__dict__.copy()
    return KeyHandler(cls.__name__, cls.__bases__, namespace)


================================================
FILE: src/accelerate/commands/menu/keymap.py
================================================
# Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Utilities relating to parsing raw characters from the keyboard, based on https://github.com/bchao1/bullet
"""

import os
import string
import sys


# Offset added to the arrow key codes so they cannot collide with the plain characters that share the same
# low byte (65-68 are also the ordinals of "A"-"D").
ARROW_KEY_FLAG = 1 << 8

# Maps a symbolic key name to the integer key code used throughout the menu package.
KEYMAP = {
    "tab": ord("\t"),
    "newline": ord("\r"),
    "esc": 27,
    "up": 65 + ARROW_KEY_FLAG,
    "down": 66 + ARROW_KEY_FLAG,
    "right": 67 + ARROW_KEY_FLAG,
    "left": 68 + ARROW_KEY_FLAG,
    "mod_int": 91,
    # Sentinel returned by `get_character` for keys it does not recognise
    "undefined": sys.maxsize,
    "interrupt": 3,
    "insert": 50,
    "delete": 51,
    "pg_up": 53,
    "pg_down": 54,
}

# Inclusive bounds of the arrow key range, used by `get_character` to recognise arrow keys
KEYMAP["arrow_begin"] = KEYMAP["up"]
KEYMAP["arrow_end"] = KEYMAP["left"]

if sys.platform == "win32":
    # Characters queued by `get_raw_chars` that still have to be handed out one at a time
    WIN_CH_BUFFER = []
    # Two-byte sequences returned by `msvcrt.getch()` for the arrow keys, mapped to the un-flagged arrow codes
    WIN_KEYMAP = {
        b"\xe0H": KEYMAP["up"] - ARROW_KEY_FLAG,
        b"\x00H": KEYMAP["up"] - ARROW_KEY_FLAG,
        b"\xe0P": KEYMAP["down"] - ARROW_KEY_FLAG,
        b"\x00P": KEYMAP["down"] - ARROW_KEY_FLAG,
        b"\xe0M": KEYMAP["right"] - ARROW_KEY_FLAG,
        b"\x00M": KEYMAP["right"] - ARROW_KEY_FLAG,
        b"\xe0K": KEYMAP["left"] - ARROW_KEY_FLAG,
        b"\x00K": KEYMAP["left"] - ARROW_KEY_FLAG,
    }

# Digit keys "0"-"9" map to their own character codes
for i in range(10):
    KEYMAP[str(i)] = ord(str(i))


def get_raw_chars():
    """
    Gets one raw character from the keyboard and returns it as a one-character string.

    On Windows (`os.name == "nt"`) the key is read with `msvcrt.getch()`. Two-byte arrow key sequences found in
    `WIN_KEYMAP` are translated into the three characters `ESC`, `[`, `<letter>`: `ESC` is returned right away and
    the other two are queued in `WIN_CH_BUFFER` to be returned by the following calls.

    On POSIX the terminal attached to `sys.stdin` is switched to raw mode for the duration of a single
    `sys.stdin.read(1)` and its previous settings are restored afterwards, even if the read fails.
    """
    if os.name == "nt":
        import msvcrt

        encoding = "mbcs"
        # Flush the keyboard buffer
        while msvcrt.kbhit():
            msvcrt.getch()
        if not WIN_CH_BUFFER:
            # Read the keystroke
            ch = msvcrt.getch()

            # If it is a prefix char, get second part
            if ch in (b"\x00", b"\xe0"):
                ch2 = ch + msvcrt.getch()
                # Translate actual Win chars to bullet char types
                try:
                    chx = chr(WIN_KEYMAP[ch2])
                    WIN_CH_BUFFER.append(chr(KEYMAP["mod_int"]))
                    WIN_CH_BUFFER.append(chx)
                    # NOTE(review): `-` binds tighter than `<<`, so each entry is `(code - 1) << 9`; with the
                    # arrow-only `WIN_KEYMAP` this membership test looks like it can never succeed. Left as-is.
                    if ord(chx) in (
                        KEYMAP["insert"] - 1 << 9,
                        KEYMAP["delete"] - 1 << 9,
                        KEYMAP["pg_up"] - 1 << 9,
                        KEYMAP["pg_down"] - 1 << 9,
                    ):
                        WIN_CH_BUFFER.append(chr(126))
                    ch = chr(KEYMAP["esc"])
                except KeyError:
                    # Indexing `bytes` yields an `int`; convert it to a character so callers can `ord()` the
                    # result like every other return value of this function
                    ch = chr(ch2[1])
            else:
                ch = ch.decode(encoding)
        else:
            ch = WIN_CH_BUFFER.pop(0)
    elif os.name == "posix":
        import termios
        import tty

        fd = sys.stdin.fileno()
        old_settings = termios.tcgetattr(fd)
        try:
            tty.setraw(fd)
            ch = sys.stdin.read(1)
        finally:
            # Always restore the terminal settings saved before entering raw mode
            termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
    return ch


def get_character():
    """
    Gets a key press from the keyboard.

    Returns the raw character for interrupt, newline and printable keys, the flagged arrow character for arrow key
    escape sequences, and `KEYMAP["undefined"]` for anything that is not recognised.
    """
    first = get_raw_chars()
    if ord(first) in [KEYMAP["interrupt"], KEYMAP["newline"]]:
        return first

    if ord(first) != KEYMAP["esc"]:
        # Ordinary key: only printable characters are passed through
        return first if first in string.printable else KEYMAP["undefined"]

    # Escape sequence: the next character decides how the rest is interpreted
    second = get_raw_chars()
    if ord(second) != KEYMAP["mod_int"]:
        return get_raw_chars()

    third = get_raw_chars()
    lowest = KEYMAP["arrow_begin"] - ARROW_KEY_FLAG
    highest = KEYMAP["arrow_end"] - ARROW_KEY_FLAG
    if ord(third) >= lowest and ord(third) <= highest:
        return chr(ord(third) + ARROW_KEY_FLAG)
    return KEYMAP["undefined"]


================================================
FILE: src/accelerate/commands/menu/selection_menu.py
================================================
# Copyright 2022 The HuggingFace Team and Brian Chao. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Main driver for the selection menu, based on https://github.com/bchao1/bullet
"""

import builtins
import sys
from typing import Optional

from ...utils.imports import _is_package_available
from . import cursor, input
from .helpers import Direction, clear_line, forceWrite, linebreak, move_cursor, reset_cursor, writeColor
from .keymap import KEYMAP


# Whether the `google.colab` package is available. `BulletMenu.run` uses this flag to read a typed index with
# `input()` instead of reacting to individual key presses.
in_colab = False
try:
    in_colab = _is_package_available("google.colab")
except ModuleNotFoundError:
    # Keep the `False` default when the availability check itself cannot import the package
    pass


@input.register
class BulletMenu:
    """
    A CLI menu to select a choice from a list of choices using the keyboard.

    Args:
        prompt (`str`, *optional*):
            Text displayed above the choices when the menu is run.
        choices (`list`, *optional*):
            The entries to choose from. Defaults to an empty list.
    """

    def __init__(self, prompt: Optional[str] = None, choices: Optional[list] = None):
        self.position = 0
        # Use a `None` sentinel rather than a mutable default so menus never share one list object
        self.choices = [] if choices is None else choices
        self.prompt = prompt
        if sys.platform == "win32":
            self.arrow_char = "*"
        else:
            self.arrow_char = "➔ "

    def write_choice(self, index, end: str = ""):
        "Writes the choice at the given index, colored everywhere except on Windows"
        if sys.platform != "win32":
            writeColor(self.choices[index], 32, end)
        else:
            forceWrite(self.choices[index], end)

    def print_choice(self, index: int):
        "Prints the choice at the given index, prefixed with the arrow when it is the current position"
        if index == self.position:
            forceWrite(f" {self.arrow_char} ")
            self.write_choice(index)
        else:
            forceWrite(f"    {self.choices[index]}")
        reset_cursor()

    def move_direction(self, direction: Direction, num_spaces: int = 1):
        "Should not be directly called, used to move a direction of either up or down"
        old_position = self.position
        if direction == Direction.DOWN:
            # Already on the last entry: nothing to do
            if self.position + 1 >= len(self.choices):
                return
            self.position += num_spaces
        else:
            # Already on the first entry: nothing to do
            if self.position - 1 < 0:
                return
            self.position -= num_spaces
        # Redraw the previously selected line without the arrow, then draw the arrow on the new line
        clear_line()
        self.print_choice(old_position)
        move_cursor(num_spaces, direction.name)
        self.print_choice(self.position)

    @input.mark(KEYMAP["up"])
    def move_up(self):
        "Moves the selection one entry up"
        self.move_direction(Direction.UP)

    @input.mark(KEYMAP["down"])
    def move_down(self):
        "Moves the selection one entry down"
        self.move_direction(Direction.DOWN)

    @input.mark(KEYMAP["newline"])
    def select(self):
        "Moves the cursor below the menu and returns the index of the current selection"
        move_cursor(len(self.choices) - self.position, "DOWN")
        return self.position

    @input.mark(KEYMAP["interrupt"])
    def interrupt(self):
        "Moves the cursor below the menu and raises `KeyboardInterrupt`"
        move_cursor(len(self.choices) - self.position, "DOWN")
        raise KeyboardInterrupt

    @input.mark_multiple(*[KEYMAP[str(number)] for number in range(10)])
    def select_row(self):
        "Jumps to the entry whose index equals the digit key that was pressed, if such an entry exists"
        index = int(chr(self.current_selection))
        if index == self.position or index >= len(self.choices):
            return
        movement = index - self.position
        if movement < 0:
            self.move_direction(Direction.UP, -movement)
        else:
            self.move_direction(Direction.DOWN, movement)

    def run(self, default_choice: int = 0):
        "Start the menu and return the selected choice"
        if self.prompt:
            linebreak()
            forceWrite(self.prompt, "\n")
            if in_colab:
                forceWrite("Please input a choice index (starting from 0), and press enter", "\n")
            else:
                forceWrite("Please select a choice using the arrow or number keys, and selecting with enter", "\n")
        self.position = default_choice
        for i in range(len(self.choices)):
            self.print_choice(i)
            forceWrite("\n")
        # Put the cursor back on the line of the default choice
        move_cursor(len(self.choices) - self.position, "UP")
        with cursor.hide():
            while True:
                if in_colab:
                    try:
                        choice = int(builtins.input())
                    except ValueError:
                        # Anything that is not an integer falls back to the default choice
                        choice = default_choice
                else:
                    choice = self.handle_input()
                if choice is not None:
                    # Erase the menu lines and leave only the selected choice on screen
                    reset_cursor()
                    for _ in range(len(self.choices) + 1):
                        move_cursor(1, "UP")
                        clear_line()
                    self.write_choice(choice, "\n")
                    return choice


================================================
FILE: src/accelerate/commands/merge.py
================================================
#!/usr/bin/env python

# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from accelerate.commands.utils import CustomArgumentParser
from accelerate.utils import merge_fsdp_weights


# Description passed to the argument parser created in `merge_command_parser`.
description = """Utility to merge the weights from multiple FSDP checkpoints into a single combined checkpoint. Should be used if
`SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}`.

This is a CPU-bound process and requires enough RAM to load the entire model state dict."""


def merge_command(args):
    """
    Run `merge_fsdp_weights` with the values parsed from the command line.

    The third positional argument is the negation of `--unsafe_serialization`, so safe serialization is requested
    unless the flag was passed.
    """
    use_safe_serialization = not args.unsafe_serialization
    merge_fsdp_weights(
        args.checkpoint_directory,
        args.output_path,
        use_safe_serialization,
        args.remove_checkpoint_dir,
    )


def merge_command_parser(subparsers=None):
    """
    Build the argument parser of the `merge-weights` command.

    Args:
        subparsers (*optional*):
            When given, the parser is added to it as the `merge-weights` sub-command and `merge_command` is
            registered as its `func` default. Otherwise a standalone `CustomArgumentParser` is created.

    Returns:
        The configured parser.
    """
    if subparsers is not None:
        parser = subparsers.add_parser("merge-weights", description=description)
    else:
        parser = CustomArgumentParser(description=description)

    parser.add_argument("checkpoint_directory", type=str, help="A directory containing sharded weights saved by FSDP.")
    # `output_path` is a required positional argument, so its help text must not advertise a default value
    parser.add_argument(
        "output_path",
        type=str,
        help="The path to save the merged weights.",
    )
    parser.add_argument(
        "--unsafe_serialization",
        action="store_true",
        default=False,
        help="Whether to save the merged weights as `.bin` rather than `.safetensors` (not recommended).",
    )
    parser.add_argument(
        "--remove_checkpoint_dir",
        action="store_true",
        help="Whether to remove the checkpoint directory after merging.",
        default=False,
    )

    if subparsers is not None:
        parser.set_defaults(func=merge_command)
    return parser


def main():
    """Parse the command line with the merge parser and run `merge_command` on the result."""
    merge_command(merge_command_parser().parse_args())


if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/commands/test.py
================================================
#!/usr/bin/env python

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

from accelerate.test_utils import execute_subprocess_async, path_in_accelerate_package


def test_command_parser(subparsers=None):
    """
    Build the argument parser of the `test` command.

    Args:
        subparsers (*optional*):
            When given, the parser is added to it as the `test` sub-command and `test_command` is registered as its
            `func` default. Otherwise a standalone `argparse.ArgumentParser` is created.

    Returns:
        The configured parser, which accepts a single optional `--config_file` argument.
    """
    standalone = subparsers is None
    if standalone:
        parser = argparse.ArgumentParser("Accelerate test command")
    else:
        parser = subparsers.add_parser("test")

    config_file_help = (
        "The path to use to store the config file. Will default to a file named default_config.yaml in the cache "
        "location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have "
        "such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
        "with 'huggingface'."
    )
    parser.add_argument("--config_file", default=None, help=config_file_help)

    if not standalone:
        parser.set_defaults(func=test_command)
    return parser


def test_command(args):
    """
    Launch the bundled test script through `accelerate-launch` and report success.

    Args:
        args: Parsed arguments; `args.config_file` is forwarded as `--config_file=...` when it is not `None`.
    """
    script_name = path_in_accelerate_package("test_utils", "scripts", "test_script.py")

    cmd = ["accelerate-launch"]
    if args.config_file is not None:
        # Keep the option as one argv entry: splitting a formatted string on whitespace would break any config
        # or script path that contains spaces
        cmd.append(f"--config_file={args.config_file}")
    cmd.append(script_name)

    result = execute_subprocess_async(cmd)
    if result.returncode == 0:
        print("Test is a success! You are ready for your distributed training!")


def main():
    """Parse the command line with the test parser and run `test_command` on the result."""
    test_command(test_command_parser().parse_args())


if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/commands/to_fsdp2.py
================================================
#!/usr/bin/env python

# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import enum
import logging
from pathlib import Path

import yaml

from accelerate.commands.utils import CustomArgumentParser


class ConversionStatus(enum.Enum):
    """Markers used as values in `ARGUMENT_KEY_MAPPING` for FSDP1 keys that have no FSDP2 key to map to."""

    NOT_YET_IMPLEMENTED = 0
    REMOVED = -1


# Maps each known FSDP config key either to the key name to use in the converted config or to a
# `ConversionStatus` marker when there is no key to convert to.
ARGUMENT_KEY_MAPPING = {
    # New keys in FSDP2
    "fsdp_version": "fsdp_version",
    "fsdp_reshard_after_forward": "fsdp_reshard_after_forward",
    # https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md
    # https://huggingface.co/docs/accelerate/en/usage_guides/fsdp
    "fsdp_auto_wrap_policy": "fsdp_auto_wrap_policy",
    "fsdp_backward_prefetch": ConversionStatus.REMOVED,
    "fsdp_forward_prefetch": ConversionStatus.NOT_YET_IMPLEMENTED,
    "fsdp_cpu_ram_efficient_loading": "fsdp_cpu_ram_efficient_loading",
    "fsdp_offload_params": "fsdp_offload_params",
    # The sharding strategy is expressed through `fsdp_reshard_after_forward` in the converted config
    "fsdp_sharding_strategy": "fsdp_reshard_after_forward",
    "fsdp_state_dict_type": "fsdp_state_dict_type",
    "fsdp_sync_module_states": ConversionStatus.REMOVED,
    "fsdp_transformer_layer_cls_to_wrap": "fsdp_transformer_layer_cls_to_wrap",
    "fsdp_min_num_params": "fsdp_min_num_params",
    "fsdp_use_orig_params": ConversionStatus.REMOVED,
    "fsdp_activation_checkpointing": "fsdp_activation_checkpointing",
}

# For the keys listed here, maps an old value to the value written to the converted config. Values that are
# not listed are passed through unchanged by `convert_config_to_fsdp2`.
ARGUMENT_VALUE_MAPPING = {
    "fsdp_sharding_strategy": {
        "FULL_SHARD": True,
        "SHARD_GRAD_OP": False,
        "HYBRID_SHARD": True,
        "HYBRID_SHARD_ZERO2": False,
        "NO_SHARD": False,
    },
    "fsdp_reshard_after_forward": {  # Needed to convert newly created configs using FSDP1 to FSDP2
        "FULL_SHARD": True,
        "SHARD_GRAD_OP": False,
        "HYBRID_SHARD": True,
        "HYBRID_SHARD_ZERO2": False,
        "NO_SHARD": False,
    },
}

# Module-level logger used for the conversion messages
logger = logging.getLogger(__name__)


def _validate_to_fsdp2_args(args):
    if not Path(args.config_file).exists():
        raise FileNotFoundError(f"Config file {args.config_file} not found")

    if not args.overwrite and args.output_file is None:
        raise ValueError("If --overwrite is not set, --output_file must be provided")

    if not args.overwrite and Path(args.output_file).exists():
        raise FileExistsError(f"Output file {args.output_file} already exists and --overwrite is not set")


def convert_config_to_fsdp2(config: dict) -> dict:
    """
    Convert the `fsdp_config` section of an Accelerate config from FSDP1 to FSDP2 key names and values.

    The config is returned untouched when it has no (or an empty) `fsdp_config` section or when that section
    already declares `fsdp_version` 2. Otherwise every key is handled according to `ARGUMENT_KEY_MAPPING`:

    - keys marked `ConversionStatus.REMOVED` or `ConversionStatus.NOT_YET_IMPLEMENTED` are dropped with a warning,
    - keys that are not in the mapping are copied unchanged with a warning,
    - all other keys are renamed, and their value is translated through `ARGUMENT_VALUE_MAPPING` when an entry
      exists for the key and value.

    Args:
        config (`dict`): The loaded config. It is modified in place.

    Returns:
        `dict`: The same `config` object, with `fsdp_config` replaced by the converted section and `fsdp_version`
        set to 2.
    """
    fsdp_config = config.get("fsdp_config", {})

    if not fsdp_config:
        logger.info("No FSDP config found in the config file, skipping conversion...")
        return config

    if fsdp_config.get("fsdp_version", 1) == 2:
        logger.warning("Config already specifies FSDP2, skipping conversion...")
        logger.warning(
            "If the config doesn't use new argument names, change `fsdp_version` to `1` and rerun the command."
        )
        return config

    new_fsdp_config = {}
    for key, value in fsdp_config.items():
        conversion_status = ARGUMENT_KEY_MAPPING.get(key)

        # Keys without an FSDP2 counterpart must not be carried over into the converted config
        if conversion_status is ConversionStatus.REMOVED:
            logger.warning(f"Argument {key} has been removed in FSDP2, skipping this key...")
            continue

        if conversion_status is ConversionStatus.NOT_YET_IMPLEMENTED:
            logger.warning(f"Argument {key} is not yet implemented in FSDP2, skipping this key...")
            continue

        # Keys unknown to the mapping are kept as they are; only their conversion is skipped
        if conversion_status is None:
            logger.warning(f"Argument {key} is not being converted, skipping this key...")
            new_fsdp_config[key] = value
            continue

        if key in ARGUMENT_VALUE_MAPPING:
            value = ARGUMENT_VALUE_MAPPING[key].get(value, value)
        new_fsdp_config[conversion_status] = value

    new_fsdp_config["fsdp_version"] = 2
    config["fsdp_config"] = new_fsdp_config
    return config


def to_fsdp2_command_parser(subparsers=None):
    """
    Build the argument parser of the `to-fsdp2` command.

    Args:
        subparsers (*optional*):
            When given, the parser is added to it as the `to-fsdp2` sub-command and `to_fsdp2_command` is registered
            as its `func` default. Otherwise a standalone `CustomArgumentParser` is created.

    Returns:
        The configured parser.
    """
    description = "Convert an Accelerate config from FSDP1 to FSDP2"
    as_subcommand = subparsers is not None

    if as_subcommand:
        parser = subparsers.add_parser("to-fsdp2", description=description)
    else:
        parser = CustomArgumentParser(description=description)

    output_file_help = "The path to the output file to write the converted config to. If not provided, the input file will be overwritten (if --overwrite is set)"

    parser.add_argument("--config_file", type=str, help="The config file to convert to FSDP2", required=True)
    parser.add_argument(
        "--overwrite", action="store_true", help="Overwrite the config file if it exists", default=False
    )
    parser.add_argument("--output_file", type=str, help=output_file_help, default=None)

    if as_subcommand:
        parser.set_defaults(func=to_fsdp2_command)

    return parser


def load_config(config_file: str) -> dict:
    """
    Load a YAML config file with `yaml.safe_load`.

    Args:
        config_file (`str`): Path of the file to read.

    Returns:
        The parsed content of the file.

    Raises:
        ValueError: If the parsed content is empty.
    """
    with open(config_file) as stream:
        loaded = yaml.safe_load(stream)

    if loaded:
        return loaded
    raise ValueError("Config file is empty")


def to_fsdp2_command(args):
    """
    Entry point of the `to-fsdp2` command: validate the arguments, convert the config and write it out.

    When `--overwrite` is set and no `--output_file` was given, the converted config is written back to
    `args.config_file`.
    """
    _validate_to_fsdp2_args(args)
    original_config = load_config(args.config_file)

    # No explicit destination together with `--overwrite` means "convert in place"
    if args.output_file is None and args.overwrite:
        args.output_file = args.config_file

    converted_config = convert_config_to_fsdp2(original_config)

    with open(args.output_file, "w") as out_stream:
        yaml.dump(converted_config, out_stream)


================================================
FILE: src/accelerate/commands/tpu.py
================================================
#!/usr/bin/env python

# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import subprocess

from packaging.version import Version, parse

from accelerate.commands.config.config_args import default_config_file, load_config_from_file


# Description passed to the argument parser created in `tpu_command_parser`.
_description = "Run commands across TPU VMs for initial setup before running `accelerate launch`."


def tpu_command_parser(subparsers=None):
    """
    Build the argument parser of the `tpu-config` command.

    Args:
        subparsers (*optional*):
            When given, the parser is added to it as the `tpu-config` sub-command and `tpu_command_launcher` is
            registered as its `func` default. Otherwise a standalone `argparse.ArgumentParser` is created.

    Returns:
        The configured parser.
    """
    as_subcommand = subparsers is not None
    if as_subcommand:
        parser = subparsers.add_parser("tpu-config", description=_description)
    else:
        parser = argparse.ArgumentParser("Accelerate tpu-config command", description=_description)

    # Options that can also come from the accelerate config file
    config_group = parser.add_argument_group(
        "Config Arguments", "Arguments that can be configured through `accelerate config`."
    )
    config_group.add_argument(
        "--config_file", type=str, default=None, help="Path to the config file to use for accelerate."
    )
    config_group.add_argument(
        "--tpu_name",
        default=None,
        help="The name of the TPU to use. If not specified, will use the TPU specified in the config file.",
    )
    config_group.add_argument(
        "--tpu_zone",
        default=None,
        help="The zone of the TPU to use. If not specified, will use the zone specified in the config file.",
    )

    # Options describing what is run on the TPU
    tpu_group = parser.add_argument_group("TPU Arguments", "Arguments for options ran inside the TPU.")
    tpu_group.add_argument(
        "--use_alpha",
        action="store_true",
        help="Whether to use `gcloud alpha` when running the TPU training script instead of `gcloud`.",
    )
    tpu_group.add_argument(
        "--command_file",
        default=None,
        help="The path to the file containing the commands to run on the pod on startup.",
    )
    tpu_group.add_argument(
        "--command",
        action="append",
        nargs="+",
        help="A command to run on the pod. Can be passed multiple times.",
    )
    tpu_group.add_argument(
        "--install_accelerate",
        action="store_true",
        help="Whether to install accelerate on the pod. Defaults to False.",
    )
    tpu_group.add_argument(
        "--accelerate_version",
        default="latest",
        help="The version of accelerate to install on the pod. If not specified, will use the latest pypi version. Specify 'dev' to install from GitHub.",
    )
    tpu_group.add_argument(
        "--debug", action="store_true", help="If set, will print the command that would be run instead of running it."
    )

    if as_subcommand:
        parser.set_defaults(func=tpu_command_launcher)
    return parser


def tpu_command_launcher(args):
    """
    Assembles the startup commands for a TPU pod and runs them on all workers through
    `gcloud [alpha] compute tpus tpu-vm ssh`.

    When a config file is passed (or the default config file exists), `command_file`, `command`, `tpu_name` and
    `tpu_zone` are filled in from it whenever they were not given on the command line.

    Args:
        args (`argparse.Namespace`):
            The parsed command-line arguments. The attributes read here are `config_file`, `command_file`,
            `command`, `tpu_name`, `tpu_zone`, `accelerate_version`, `install_accelerate`, `use_alpha` and `debug`.

    Raises:
        `ValueError`: If neither a command file nor a command is available.
        `subprocess.CalledProcessError`: If the `gcloud` call exits with a non-zero status.
    """
    # Get the default from the config file if it exists.
    if args.config_file is not None or os.path.isfile(default_config_file):
        defaults = load_config_from_file(args.config_file)
        if not args.command_file and defaults.command_file is not None and not args.command:
            args.command_file = defaults.command_file
        if not args.command and defaults.commands is not None:
            args.command = defaults.commands
        if not args.tpu_name:
            args.tpu_name = defaults.tpu_name
        if not args.tpu_zone:
            args.tpu_zone = defaults.tpu_zone

    # Translate the requested version into what is appended to `pip install`.
    if args.accelerate_version == "dev":
        args.accelerate_version = "git+https://github.com/huggingface/accelerate.git"
    elif args.accelerate_version == "latest":
        args.accelerate_version = "accelerate -U"
    elif isinstance(parse(args.accelerate_version), Version):
        args.accelerate_version = f"accelerate=={args.accelerate_version}"

    if not args.command_file and not args.command:
        raise ValueError("You must specify either a command file or a command to run on the pod.")

    if args.command_file:
        with open(args.command_file) as f:
            # Wrapped in a list to match the list-of-lists shape flattened below.
            args.command = [f.read().splitlines()]

    # To turn list of lists into list of strings
    if isinstance(args.command[0], list):
        args.command = [line for cmd in args.command for line in cmd]
    # Default to the shared folder and install accelerate
    new_cmd = ["cd /usr/share"]
    if args.install_accelerate:
        new_cmd += [f"pip install {args.accelerate_version}"]
    new_cmd += args.command
    args.command = "; ".join(new_cmd)

    # Then send it to gcloud
    # Eventually try to use google-api-core to do this instead of subprocess
    cmd = ["gcloud"]
    if args.use_alpha:
        cmd += ["alpha"]
    cmd += [
        "compute",
        "tpus",
        "tpu-vm",
        "ssh",
        args.tpu_name,
        "--zone",
        args.tpu_zone,
        "--command",
        args.command,
        "--worker",
        "all",
    ]
    if args.debug:
        print(f"Running {' '.join(cmd)}")
        return
    # `check=True` makes a failing `gcloud` call raise instead of being reported as a success below.
    subprocess.run(cmd, check=True)
    print("Successfully setup pod.")


def main():
    """Script entry point: parses the TPU command-line arguments and passes them to `tpu_command_launcher`."""
    cli_args = tpu_command_parser().parse_args()
    tpu_command_launcher(cli_args)


================================================
FILE: src/accelerate/commands/utils.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse


class _StoreAction(argparse.Action):
    """
    Store action that accepts both the `_` and the `-` spelling of an option and remembers which destinations were
    set explicitly.

    Every option string that contains an underscore after its first two characters gets a twin with all `_` replaced
    by `-`. When the action is triggered, the value is stored on the namespace and the destination name is added to
    the `nondefault` set of that namespace (created on first use).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        expanded = []
        for flag in self.option_strings:
            expanded.append(flag)
            # The first two characters (typically the `--` prefix) are not inspected.
            if "_" in flag[2:]:
                expanded.append(flag.replace("_", "-"))
        self.option_strings = expanded

    def __call__(self, parser, namespace, values, option_string=None):
        setattr(namespace, self.dest, values)
        # Lazily create the bookkeeping set the first time any of these actions fires.
        if not hasattr(namespace, "nondefault"):
            setattr(namespace, "nondefault", set())
        namespace.nondefault.add(self.dest)


class _StoreConstAction(_StoreAction):
    """
    Counterpart of `argparse._StoreConstAction` built on the custom `_StoreAction`: it consumes no command-line
    values and stores `const` when triggered.
    """

    def __init__(self, option_strings, dest, const, default=None, required=False, help=None):
        init_kwargs = {
            "option_strings": option_strings,
            "dest": dest,
            # The flag takes no values on the command line.
            "nargs": 0,
            "const": const,
            "default": default,
            "required": required,
            "help": help,
        }
        super().__init__(**init_kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        # The incoming `values` are ignored; the configured constant is stored instead.
        super().__call__(parser, namespace, self.const, option_string)


class _StoreTrueAction(_StoreConstAction):
    """
    Counterpart of `argparse._StoreTrueAction` built on the custom `_StoreConstAction`: stores `True` when the flag
    is present.
    """

    def __init__(self, option_strings, dest, default=None, required=False, help=None):
        super().__init__(
            option_strings=option_strings,
            dest=dest,
            const=True,
            default=default,
            required=required,
            help=help,
        )


class CustomArgumentGroup(argparse._ArgumentGroup):
    """
    Argument group whose store-like actions are swapped for the custom action classes of this module, so that
    arguments can be passed with either `-` or `_`.
    """

    def _add_action(self, action):
        fields = vars(action)
        # `argparse._StoreTrueAction` derives from `argparse._StoreConstAction`, so it has to be tested first.
        if isinstance(action, argparse._StoreTrueAction):
            keys = ("option_strings", "dest", "default", "required", "help")
            action = _StoreTrueAction(*(fields[key] for key in keys))
        elif isinstance(action, argparse._StoreConstAction):
            keys = ("option_strings", "dest", "const", "default", "required", "help")
            action = _StoreConstAction(*(fields[key] for key in keys))
        elif isinstance(action, argparse._StoreAction):
            action = _StoreAction(**fields)
        # Every other kind of action is registered untouched.
        return super()._add_action(action)


class CustomArgumentParser(argparse.ArgumentParser):
    """
    Custom argument parser that allows for the use of `-` or `_` in arguments passed and overrides the help for each
    when applicable.
    """

    def add_argument(self, *args, **kwargs):
        """
        Registers an argument, substituting the custom action classes where applicable.

        `action="store_true"` is mapped to `_StoreTrueAction`, and a missing `action` defaults to `_StoreAction`. Any
        other explicitly passed action is forwarded unchanged.

        Returns:
            Whatever `argparse.ArgumentParser.add_argument` returns (the action it created), so callers can keep a
            reference to it as they would with a plain `ArgumentParser`.
        """
        if "action" in kwargs:
            # Translate action -> class
            if kwargs["action"] == "store_true":
                kwargs["action"] = _StoreTrueAction
        else:
            kwargs["action"] = _StoreAction
        # Propagate the return value instead of silently returning `None`.
        return super().add_argument(*args, **kwargs)

    def add_argument_group(self, *args, **kwargs):
        """
        Creates a `CustomArgumentGroup` attached to this parser, registers it in `self._action_groups` and returns
        it.
        """
        group = CustomArgumentGroup(self, *args, **kwargs)
        self._action_groups.append(group)
        return group


================================================
FILE: src/accelerate/data_loader.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import math
from contextlib import suppress
from typing import Callable, Optional, Union

import torch
from packaging import version
from torch.utils.data import BatchSampler, DataLoader, IterableDataset, RandomSampler

from .logging import get_logger
from .state import DistributedType, GradientState, PartialState, is_torch_xla_available
from .utils import (
    RNGType,
    broadcast,
    broadcast_object_list,
    compare_versions,
    concatenate,
    find_batch_size,
    get_data_structure,
    initialize_tensors,
    is_datasets_available,
    is_torch_version,
    is_torchdata_stateful_dataloader_available,
    send_to_device,
    slice_tensors,
    synchronize_rng_states,
)


# Module-level logger, obtained through the `get_logger` helper imported from `.logging`.
logger = get_logger(__name__)

# Keyword arguments of `DataLoader`, mapped to the default value recorded for each, as of the minimum version (2.0).
_PYTORCH_DATALOADER_KWARGS = {
    "batch_size": 1,
    "shuffle": False,
    "sampler": None,
    "batch_sampler": None,
    "num_workers": 0,
    "collate_fn": None,
    "pin_memory": False,
    "drop_last": False,
    "timeout": 0,
    "worker_init_fn": None,
    "multiprocessing_context": None,
    "generator": None,
    "prefetch_factor": 2,
    "persistent_workers": False,
    "pin_memory_device": "",
}

# Keyword arguments added in later releases, keyed by the version string they are gated on below.
_PYTORCH_DATALOADER_ADDITIONAL_KWARGS = {"2.6.0": {"in_order": True}}

# Merge the additional kwargs into the main table only when the installed torch is at least the gating version.
for v, additional_kwargs in _PYTORCH_DATALOADER_ADDITIONAL_KWARGS.items():
    if is_torch_version(">=", v):
        _PYTORCH_DATALOADER_KWARGS.update(additional_kwargs)


class SeedableRandomSampler(RandomSampler):
    """
    A `RandomSampler` that re-seeds its generator at the start of every iteration.

    This is needed in distributed settings, where the random generator of each process has to start from the same
    seed and stay reproducible across several iterations.

    The seed used for an iteration is `initial_seed + epoch`: `initial_seed` comes from the `data_seed` keyword
    argument (or from `torch.random.initial_seed()` when it is not given) and `epoch` is incremented after each
    complete pass. A custom `generator` passed at construction is re-seeded the same way.
    """

    def __init__(self, *args, **kwargs):
        # `data_seed` is specific to this class and must not reach `RandomSampler.__init__`.
        data_seed = kwargs.pop("data_seed", None)
        super().__init__(*args, **kwargs)

        if data_seed is None:
            data_seed = torch.random.initial_seed()
        self.initial_seed = data_seed
        self.epoch = 0

    def __iter__(self):
        if self.generator is None:
            # `torch.get_default_device` is not available on every torch version; fall back to the CPU.
            default_device_fn = getattr(torch, "get_default_device", None)
            device = "cpu" if default_device_fn is None else default_device_fn()
            self.generator = torch.Generator(device=device)
            self.generator.manual_seed(self.initial_seed)

        # The current epoch shifts the seed so that each pass gets its own reproducible order.
        self.generator.manual_seed(self.initial_seed + self.epoch)
        yield from super().__iter__()
        self.set_epoch(self.epoch + 1)

    def set_epoch(self, epoch: int):
        "Sets the current iteration of the sampler."
        self.epoch = epoch


class BatchSamplerShard(BatchSampler):
    """
    Wraps a PyTorch `BatchSampler` so that each process only receives its own share of the batches. Every process
    yields the same number of batches, all of the same size. Depending on the `drop_last` attribute of the wrapped
    batch sampler, iteration either stops at the first batch that would be too small / missing on some process, or
    wraps around and completes it with indices from the beginning.

    Args:
        batch_sampler (`torch.utils.data.sampler.BatchSampler`):
            The batch sampler to split in several shards.
        num_processes (`int`, *optional*, defaults to 1):
            The number of processes running concurrently.
        process_index (`int`, *optional*, defaults to 0):
            The index of the current process.
        split_batches (`bool`, *optional*, defaults to `False`):
            Whether the shards should be created by splitting a batch to give a piece of it on each process, or by
            yielding different full batches on each process.

            On two processes with a sampler of `[[0, 1, 2, 3], [4, 5, 6, 7]]`, this will result in:

            - the sampler on process 0 to yield `[0, 1, 2, 3]` and the sampler on process 1 to yield `[4, 5, 6, 7]` if
              this argument is set to `False`.
            - the sampler on process 0 to yield `[0, 1]` then `[4, 5]` and the sampler on process 1 to yield `[2, 3]`
              then `[6, 7]` if this argument is set to `True`.
        even_batches (`bool`, *optional*, defaults to `True`):
            Whether or not to loop back at the beginning of the sampler when the number of samples is not a round
            multiple of (original batch size / number of processes).

    <Tip warning={true}>

    `BatchSampler`s with varying batch sizes are not enabled by default. To enable this behaviour, set `even_batches`
    equal to `False`

    </Tip>"""

    def __init__(
        self,
        batch_sampler: BatchSampler,
        num_processes: int = 1,
        process_index: int = 0,
        split_batches: bool = False,
        even_batches: bool = True,
    ):
        if split_batches:
            # A batch can only be cut into equal pieces when its size is divisible by the number of processes.
            if batch_sampler.batch_size % num_processes != 0:
                message = (
                    f"To use `BatchSamplerShard` in `split_batches` mode, the batch size ({batch_sampler.batch_size}) "
                    f"needs to be a round multiple of the number of processes ({num_processes})."
                )
                raise ValueError(message)
        self.batch_sampler = batch_sampler
        self.num_processes = num_processes
        self.process_index = process_index
        self.split_batches = split_batches
        self.even_batches = even_batches
        self.batch_size = getattr(batch_sampler, "batch_size", None)
        self.drop_last = getattr(batch_sampler, "drop_last", False)
        if self.even_batches and self.batch_size is None:
            raise ValueError(
                "You need to use `even_batches=False` when the batch sampler has no batch size. If you "
                "are not calling this method directly, set `accelerator.even_batches=False` instead."
            )

    @property
    def total_length(self):
        "Number of batches in the wrapped batch sampler, before sharding."
        return len(self.batch_sampler)

    def __len__(self):
        total = len(self.batch_sampler)
        if self.split_batches:
            # Splitting batches keeps the number of batches unchanged, only their size shrinks.
            return total
        full_rounds, leftover = divmod(total, self.num_processes)
        if leftover == 0 or self.drop_last:
            # Either the batches divide evenly across processes, or the incomplete round is dropped.
            return full_rounds
        if self.even_batches:
            # The incomplete round is completed for every process.
            return full_rounds + 1
        # Only the first `leftover` processes get a batch from the incomplete round.
        return full_rounds + 1 if self.process_index < leftover else full_rounds

    def __iter__(self):
        if self.split_batches:
            return self._iter_with_split()
        return self._iter_with_no_split()

    def _iter_with_split(self):
        "Yields, for every batch of the wrapped sampler, the slice belonging to this process."
        first_batch = []
        shard_size = self.batch_sampler.batch_size // self.num_processes

        def shard_of(full_batch):
            start = shard_size * self.process_index
            return full_batch[start : start + shard_size]

        for position, batch in enumerate(self.batch_sampler):
            if position == 0:
                first_batch = batch
            if len(batch) == self.batch_size:
                # A full batch: yield the part of it this process is responsible for.
                yield shard_of(batch)

        # If drop_last is True or the last batch was full, iteration is over, otherwise...
        if self.drop_last or len(first_batch) == 0 or len(batch) >= self.batch_size:
            return

        if not self.even_batches:
            # Uneven batches allowed: only yield when the short batch reaches this process' slice.
            if len(batch) > shard_size * self.process_index:
                yield shard_of(batch)
            return

        # For degenerate cases where the dataset has less than num_process * batch_size samples
        while len(first_batch) < self.batch_size:
            first_batch += first_batch
        batch = batch + first_batch
        yield shard_of(batch)

    def _iter_with_no_split(self):
        "Yields the full batches assigned to this process, distributing batches round-robin across processes."
        head = []
        pending = []
        for position, batch in enumerate(self.batch_sampler):
            # We gather the initial indices in case we need to circle back at the end.
            if not self.drop_last and position < self.num_processes:
                head += batch
            slot = position % self.num_processes
            # We identify the batch to yield but wait until we are sure every process gets a full batch before
            # actually yielding it.
            if slot == self.process_index:
                pending = batch
            if slot == self.num_processes - 1 and (self.batch_size is None or len(batch) == self.batch_size):
                yield pending
                pending = []

        # If drop_last is True, iteration is over, otherwise...
        if self.drop_last or len(head) == 0:
            return

        if not self.even_batches:
            if len(pending) > 0:
                yield pending
            return

        # ... we yield the complete batch we had saved before if it has the proper length
        if len(pending) == self.batch_size:
            yield pending

        # For degenerate cases where the dataset has less than num_process * batch_size samples
        while len(head) < self.num_processes * self.batch_size:
            head += head

        # If the last batch seen was of the proper size, it has been yielded by its process so we move to the next
        if len(batch) == self.batch_size:
            batch = []
            position += 1

        # Make sure we yield a multiple of self.num_processes batches
        cursor = 0
        while position % self.num_processes != 0 or len(batch) > 0:
            stop = cursor + self.batch_size - len(batch)
            batch += head[cursor:stop]
            if position % self.num_processes == self.process_index:
                yield batch
            cursor = stop
            batch = []
            position += 1


class IterableDatasetShard(IterableDataset):
    """
    Wraps a PyTorch `IterableDataset` to generate samples for one of the processes only. Instances of this class will
    always yield a number of samples that is a round multiple of the actual batch size (depending of the value of
    `split_batches`, this is either `batch_size` or `batch_size x num_processes`). Depending on the value of the
    `drop_last` argument, it will either stop the iteration at the first batch that would be too small or loop with
    samples from the beginning.

    Args:
        dataset (`torch.utils.data.dataset.IterableDataset`):
            The iterable dataset to split in several shards.
        batch_size (`int`, *optional*, defaults to 1):
            The size of the batches per shard (if `split_batches=False`) or the size of the batches (if
            `split_batches=True`).
        drop_last (`bool`, *optional*, defaults to `False`):
            Whether or not to drop the last incomplete batch or complete the last batches by using the samples from the
            beginning.
        num_processes (`int`, *optional*, defaults to 1):
            The number of processes running concurrently.
        process_index (`int`, *optional*, defaults to 0):
            The index of the current process.
        split_batches (`bool`, *optional*, defaults to `False`):
            Whether the shards should be created by splitting a batch to give a piece of it on each process, or by
            yielding different full batches on each process.

            On two processes with an iterable dataset yielding of `[0, 1, 2, 3, 4, 5, 6, 7]`, this will result in:

            - the shard on process 0 to yield `[0, 1, 2, 3]` and the shard on process 1 to yield `[4, 5, 6, 7]` if this
              argument is set to `False`.
            - the shard on process 0 to yield `[0, 1, 4, 5]` and the sampler on process 1 to yield `[2, 3, 6, 7]` if
              this argument is set to `True`.
    """

    def __init__(
        self,
        dataset: IterableDataset,
        batch_size: int = 1,
        drop_last: bool = False,
        num_processes: int = 1,
        process_index: int = 0,
        split_batches: bool = False,
    ):
        if split_batches and batch_size > 1 and batch_size % num_processes != 0:
            raise ValueError(
                f"To use `IterableDatasetShard` in `split_batches` mode, the batch size ({batch_size}) "
                f"needs to be a round multiple of the number of processes ({num_processes})."
            )
        self.dataset: IterableDataset = dataset
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.num_processes = num_processes
        self.process_index = process_index
        self.split_batches = split_batches
        # `__iter__` may seed the dataset generator with the epoch, so it must exist even if `set_epoch` is never
        # called.
        self.epoch = 0

    def set_epoch(self, epoch):
        "Records the current epoch and forwards it to the wrapped dataset when it exposes `set_epoch`."
        self.epoch = epoch
        if hasattr(self.dataset, "set_epoch"):
            self.dataset.set_epoch(epoch)

    def _batch_sizes(self):
        """
        Returns `(real_batch_size, process_batch_size)`: the number of samples consumed from the dataset per step
        across all processes, and the number of those samples that belong to a single process.
        """
        if self.split_batches:
            return self.batch_size, self.batch_size // self.num_processes
        return self.batch_size * self.num_processes, self.batch_size

    def __len__(self):
        # We will just raise the downstream error if the underlying dataset is not sized
        # The same batch sizes as in `__iter__` are used so that the length matches the number of yielded samples,
        # including in `split_batches` mode.
        real_batch_size, process_batch_size = self._batch_sizes()
        if self.drop_last:
            return (len(self.dataset) // real_batch_size) * process_batch_size
        else:
            return math.ceil(len(self.dataset) / real_batch_size) * process_batch_size

    def __iter__(self):
        # When the dataset cannot be told about the epoch itself but carries a torch generator, seed that generator
        # with the epoch.
        if (
            not hasattr(self.dataset, "set_epoch")
            and hasattr(self.dataset, "generator")
            and isinstance(self.dataset.generator, torch.Generator)
        ):
            self.dataset.generator.manual_seed(self.epoch)
        real_batch_size, process_batch_size = self._batch_sizes()
        # Positions, inside a full batch, of the samples that belong to this process.
        process_slice = range(self.process_index * process_batch_size, (self.process_index + 1) * process_batch_size)

        first_batch = None
        current_batch = []
        for element in self.dataset:
            current_batch.append(element)
            # Wait to have a full batch before yielding elements.
            if len(current_batch) == real_batch_size:
                for i in process_slice:
                    yield current_batch[i]
                if first_batch is None:
                    first_batch = current_batch.copy()
                current_batch = []

        # Finished if drop_last is True, otherwise complete the last batch with elements from the beginning.
        if not self.drop_last and len(current_batch) > 0:
            if first_batch is None:
                # The dataset is smaller than one full batch: cycle over what we have.
                first_batch = current_batch.copy()
            while len(current_batch) < real_batch_size:
                current_batch += first_batch
            for i in process_slice:
                yield current_batch[i]


class DataLoaderStateMixin:
    """
    Mixin that gives a `DataLoader` a small amount of state describing where it is in its iteration: whether the end
    has been reached and how many items the last batch holds relative to the total batch size.

    **Available attributes:**

        - **end_of_dataloader** (`bool`) -- Whether at the last iteration or batch
        - **remainder** (`int`) -- The number of items that are remaining in the last batch, relative to the total
          batch size

    <Tip warning={true}>

        Inheriters of this class should ensure that the class creates a `GradientState()` instance, stored in
        `self.gradient_state`.

    </Tip>

    """

    def __init_subclass__(cls, **kwargs):
        # Give every subclass its own class-level defaults.
        cls.end_of_dataloader, cls.remainder = False, -1

    def reset(self):
        "Puts the iteration state back to its initial values."
        self.end_of_dataloader, self.remainder = False, -1

    def begin(self):
        "Prepares the gradient state for the current dataloader"
        self.reset()
        # Best effort: if anything needed to compute the remainder is missing or fails, it simply stays at -1.
        with suppress(Exception):
            if not self._drop_last:
                dataset = self.dataset
                fallback_length = len(dataset)
                length = getattr(dataset, "total_dataset_length", fallback_length)
                self.remainder = length % self.total_batch_size
        self.gradient_state._add_dataloader(self)

    def end(self):
        "Cleans up the gradient state after exiting the dataloader"
        self.gradient_state._remove_dataloader(self)


class DataLoaderAdapter:
    """
    A class which wraps around a PyTorch `DataLoader` (or variants of it) to be used with the `Accelerator`. For
    compatibility reasons, instances report the class of the wrapped dataloader through `__class__` and delegate
    unknown attributes to it, so they can be used as a drop-in.

    Args:
        dataset:
            The dataset handed to the underlying dataloader.
        use_stateful_dataloader (`bool`, *optional*, defaults to `False`):
            Whether to build a `StatefulDataLoader` from `torchdata` instead of a regular `DataLoader`.
        batch_sampler (*optional*):
            The batch sampler handed to the underlying dataloader.
        **kwargs (additional keyword arguments, *optional*):
            All other keyword arguments to pass to the underlying dataloader.

    Raises:
        `ImportError`: If `use_stateful_dataloader=True` but `StatefulDataLoader` is not available.
    """

    def __init__(self, dataset, use_stateful_dataloader=False, batch_sampler=None, **kwargs):
        self.use_stateful_dataloader = use_stateful_dataloader
        if is_torchdata_stateful_dataloader_available():
            from torchdata.stateful_dataloader import StatefulDataLoader

        if use_stateful_dataloader and not is_torchdata_stateful_dataloader_available():
            raise ImportError(
                "StatefulDataLoader is not available. Please install torchdata version 0.8.0 or higher to use it."
            )
        if use_stateful_dataloader:
            # `import importlib` alone does not guarantee that the `metadata` submodule is loaded, so import it
            # explicitly before using it.
            import importlib.metadata

            torchdata_version = version.parse(importlib.metadata.version("torchdata"))
            # Drop `in_order` for the combination torchdata < 0.11 with torch >= 2.6.0.
            if (
                "in_order" in kwargs
                and compare_versions(torchdata_version, "<", "0.11")
                and is_torch_version(">=", "2.6.0")
            ):
                kwargs.pop("in_order")
            self.base_dataloader = StatefulDataLoader(dataset, batch_sampler=batch_sampler, **kwargs)
        else:
            self.base_dataloader = DataLoader(dataset, batch_sampler=batch_sampler, **kwargs)

        # Keep an initial snapshot when the wrapped dataloader supports state dicts.
        if hasattr(self.base_dataloader, "state_dict"):
            self.dl_state_dict = self.base_dataloader.state_dict()

    def __getattr__(self, name):
        # Avoid infinite recursion if we try to access a nonexistent base_dataloader attribute.
        if name == "base_dataloader":
            raise AttributeError()
        # Delegate attribute access to the internal dataloader
        return getattr(self.base_dataloader, name)

    def state_dict(self):
        "Returns the last snapshot stored in `self.dl_state_dict`."
        return self.dl_state_dict

    def load_state_dict(self, state_dict):
        "Forwards `state_dict` to the wrapped dataloader."
        self.base_dataloader.load_state_dict(state_dict)

    @property
    def __class__(self):
        """
        In order to maintain backwards compatibility with other code, we need to ensure `isinstance(obj, DataLoader)`
        returns true. This is because some downstream code assumes that the `DataLoader` is the base class of the
        object.
        """
        return self.base_dataloader.__class__

    def __len__(self):
        return len(self.base_dataloader)

    def adjust_state_dict_for_prefetch(self):
        """
        Adjusts the state dict for prefetching. Natively, this will adjust all of the iters yielded keys in
        `self.dl_state_dict` by a factor of `num_processes - 1`, however if a custom correction is needed, this can be
        overridden.

        This should modify `self.dl_state_dict` directly
        """
        # The state dict will be off by a factor of `n-1` batch too many during DDP,
        # so we need to adjust it here
        if PartialState().distributed_type != DistributedType.NO:
            factor = PartialState().num_processes - 1
            # When num_workers > 0, StatefulDataLoader uses _MultiProcessingDataLoaderIter
            # which may not have _sampler_iter_yielded or _num_yielded in its state_dict
            if "_sampler_iter_yielded" in self.dl_state_dict and self.dl_state_dict["_sampler_iter_yielded"] > 0:
                self.dl_state_dict["_sampler_iter_yielded"] -= factor
            if "_num_yielded" in self.dl_state_dict and self.dl_state_dict["_num_yielded"] > 0:
                self.dl_state_dict["_num_yielded"] -= factor
            if self.dl_state_dict.get("_index_sampler_state") is not None:
                if (
                    "samples_yielded" in self.dl_state_dict["_index_sampler_state"]
                    and self.dl_state_dict["_index_sampler_state"]["samples_yielded"] > 0
                ):
                    # This counter is in samples rather than batches, hence the scaling by the batch size.
                    self.dl_state_dict["_index_sampler_state"]["samples_yielded"] -= self.batch_size * factor

    def _update_state_dict(self):
        # The state_dict of the underlying base_dataloader may be ahead of what is currently being yielded.
        # E.g. the implementation of DataLoaderShard involves having an underlying iterator 1 element ahead of
        # what it wants to yield.
        #
        # _update_state_dict is called to snapshot the state_dict that would properly recover the DataLoaderAdapter.
        if hasattr(self.base_dataloader, "state_dict"):
            self.dl_state_dict = self.base_dataloader.state_dict()
            # Potentially modify the state_dict to adjust for prefetching
            self.adjust_state_dict_for_prefetch()
            # Then tag if we are at the end of the dataloader
            self.dl_state_dict["_iterator_finished"] = self.end_of_dataloader


class DataLoaderShard(DataLoaderAdapter, DataLoaderStateMixin):
    """
    `DataLoaderAdapter` subclass that takes care of device placement and of the current distributed setup.

    Args:
        dataset (`torch.utils.data.dataset.Dataset`):
            The dataset to use to build this dataloader.
        device (`torch.device`, *optional*):
            If passed, the device to put all batches on.
        rng_types (list of `str` or [`~utils.RNGType`]):
            The list of random number generators to synchronize at the beginning of each iteration. Should be one or
            several of:

            - `"torch"`: the base torch random number generator
            - `"cuda"`: the CUDA random number generator (GPU only)
            - `"xla"`: the XLA random number generator (TPU only)
            - `"generator"`: an optional `torch.Generator`
        synchronized_generator (`torch.Generator`, *optional*):
            A random number generator to keep synchronized across processes.
        skip_batches (`int`, *optional*, defaults to 0):
            The number of batches to skip at the beginning.
        use_stateful_dataloader (`bool`, *optional*, defaults to `False`):
            Whether to have this class adapt `StatefulDataLoader` from `torchdata` instead of the regular `DataLoader`.
        **kwargs (additional keyword arguments, *optional*):
            All other keyword arguments to pass to the regular `DataLoader` initialization.

    **Available attributes:**

        - **total_batch_size** (`int`) -- Total batch size of the dataloader across all processes.
            Equal to the original batch size when `split_batches=True`; otherwise the original batch size * the total
            number of processes

        - **total_dataset_length** (`int`) -- Total length of the inner dataset across all processes.
    """

    def __init__(
        self,
        dataset,
        device=None,
        rng_types=None,
        synchronized_generator=None,
        skip_batches=0,
        use_stateful_dataloader=False,
        _drop_last: bool = False,
        _non_blocking: bool = False,
        torch_device_mesh=None,
        **kwargs,
    ):
        # `torch_device_mesh` is part of the signature but is neither stored nor forwarded to the base class.
        super().__init__(dataset, use_stateful_dataloader=use_stateful_dataloader, **kwargs)
        # Device placement settings.
        self.device = device
        self._non_blocking = _non_blocking
        # RNG synchronization settings.
        self.rng_types = rng_types
        self.synchronized_generator = synchronized_generator
        # Iteration bookkeeping.
        self.skip_batches = skip_batches
        self._drop_last = _drop_last
        self.gradient_state = GradientState()
        self.iteration = 0

    def adjust_state_dict_for_prefetch(self):
        """
        No-op override.

        DataLoaderShard does not need the DDP prefetch adjustment that DataLoaderDispatcher needs. In DataLoaderShard,
        each process has its own sharded base dataloader and the 1-batch look-ahead is already accounted for by the
        timing of `_update_state_dict()` calls (called before the inner `next()`, so the captured state already equals
        the number of batches yielded to the user).
        """

    def __iter__(self):
        """
        Yield the batches of the base dataloader, moved to `self.device` when one is set.

        The base iterator is kept one batch ahead of what is yielded so that the end of the dataloader is known
        (`end_of_dataloader` is set) before the last batch is handed out. The first `skip_batches` batches are
        consumed but not yielded.
        """
        if self.rng_types is not None:
            synchronize_rng_states(self.rng_types, self.synchronized_generator)
        self.begin()

        self.set_epoch(self.iteration)
        base_iter = self.base_dataloader.__iter__()

        # Prime the look-ahead with the first batch; an empty dataloader ends right away.
        exhausted = object()
        current_batch = next(base_iter, exhausted)
        if current_batch is exhausted:
            self.end()
            return

        batch_index = 0
        try:
            while True:
                # The batch is moved to the device before fetching the next one, so the transfer is done
                # before `StopIteration` is reached.
                if self.device is not None:
                    current_batch = send_to_device(current_batch, self.device, non_blocking=self._non_blocking)
                # Snapshot the state before advancing the base iterator.
                self._update_state_dict()
                upcoming_batch = next(base_iter)
                if batch_index >= self.skip_batches:
                    yield current_batch
                batch_index += 1
                current_batch = upcoming_batch
        except StopIteration:
            # `current_batch` is the last one: flag the end and refresh the snapshot before yielding it.
            self.end_of_dataloader = True
            self._update_state_dict()
            if batch_index >= self.skip_batches:
                yield current_batch

        self.iteration += 1
        self.end()

    def __reduce__(self):
        """
        Make `DataLoaderShard` picklable.

        This has to be defined explicitly because the default pickling behavior is broken by `DataLoaderAdapter`
        messing with its `__class__` member: the callable of the inherited reduce value is replaced by
        `DataLoaderShard` while the remaining items are kept.
        """
        inherited = super().__reduce__()
        return (DataLoaderShard,) + tuple(inherited[1:])

    def set_epoch(self, epoch: int):
        """
        Record `epoch` as the current iteration and forward it to every sampler/dataset that supports `set_epoch`.
        """
        # In case it is manually passed in, the user can set it to what they like
        if self.iteration != epoch:
            self.iteration = epoch

        batch_sampler = self.batch_sampler
        if hasattr(batch_sampler, "set_epoch"):
            batch_sampler.set_epoch(epoch)

        direct_sampler = getattr(batch_sampler, "sampler", None)
        if hasattr(direct_sampler, "set_epoch"):
            direct_sampler.set_epoch(epoch)

        # Sampler sitting under a wrapped batch sampler (`batch_sampler.batch_sampler.sampler`).
        nested_sampler = getattr(getattr(batch_sampler, "batch_sampler", None), "sampler", None)
        if hasattr(nested_sampler, "set_epoch"):
            nested_sampler.set_epoch(epoch)
        # We support if a custom `Dataset` implementation has `set_epoch`
        # or in general HF datasets `Datasets`.
        # Note that this fallback is only tied to the nested-sampler check right above.
        elif hasattr(self.dataset, "set_epoch"):
            self.dataset.set_epoch(epoch)

    @property
    def total_batch_size(self):
        """
        Total batch size across all processes, computed from the batch sampler's `batch_size`, `split_batches` and
        `num_processes` attributes (the last two defaulting to `False` and `1` when absent).
        """
        if isinstance(self.sampler, BatchSampler):
            batch_sampler = self.sampler
        else:
            batch_sampler = self.batch_sampler

        if getattr(batch_sampler, "split_batches", False):
            return batch_sampler.batch_size
        return batch_sampler.batch_size * getattr(batch_sampler, "num_processes", 1)

    @property
    def total_dataset_length(self):
        """
        The dataset's `total_length` attribute when it has one, `len(self.dataset)` otherwise.
        """
        dataset = self.dataset
        if not hasattr(dataset, "total_length"):
            return len(dataset)
        return dataset.total_length

    def get_sampler(self):
        """
        Return the sampler associated to this dataloader (delegates to the module-level `get_sampler`).
        """
        return get_sampler(self)

    def set_sampler(self, sampler):
        """
        Install `sampler` as the index sampler used by this dataloader.
        """
        if isinstance(self.sampler, BatchSampler):
            # The dataloader's own `sampler` is a batch sampler: replace the sampler it wraps.
            self.sampler.sampler = sampler
            return

        self.batch_sampler.sampler = sampler
        # Keep a wrapped batch sampler in sync as well.
        if hasattr(self.batch_sampler, "batch_sampler"):
            self.batch_sampler.batch_sampler.sampler = sampler


if is_torch_xla_available():
    import torch_xla.distributed.parallel_loader as xpl

    class MpDeviceLoaderWrapper(xpl.MpDeviceLoader):
        """
        `xpl.MpDeviceLoader` wrapper that knows the total batch size.

        XLA preloading threads will all call DataLoaderShard's __iter__(). `rng_types` is therefore removed from the
        wrapped DataLoaderShard to prevent it from using the XLA device in the preloading threads, and the RNG is
        synchronized once from the main thread only.

        **Available attributes:**

        - **total_batch_size** (`int`) -- Total batch size of the dataloader across all processes.
            Equal to the original batch size when `split_batches=True`; otherwise the original batch size * the total
            number of processes

        - **total_dataset_length** (`int`) -- Total length of the inner dataset across all processes.
        """

        def __init__(self, dataloader: DataLoaderShard, device: torch.device):
            super().__init__(dataloader, device)
            wrapped = self._loader
            # Take over the RNG types from the wrapped loader so that only this wrapper synchronizes them.
            self._rng_types = wrapped.rng_types
            wrapped.rng_types = None
            self.device = device

        @property
        def dataloader(self):
            """The wrapped dataloader."""
            return self._loader

        @property
        def batch_sampler(self):
            """The batch sampler of the wrapped dataloader."""
            return self._loader.batch_sampler

        @property
        def total_batch_size(self):
            """The `total_batch_size` of the wrapped dataloader."""
            return self._loader.total_batch_size

        @property
        def total_dataset_length(self):
            """The `total_dataset_length` of the wrapped dataloader."""
            return self._loader.total_dataset_length

        def __iter__(self):
            """Synchronize the RNG states (when RNG types were configured), then iterate as the parent class does."""
            rng_types = self._rng_types
            if rng_types is not None:
                synchronize_rng_states(rng_types, self._loader.synchronized_generator)

            return super().__iter__()

        def set_epoch(self, epoch: int):
            """Forward `epoch` to the wrapped dataloader when it supports `set_epoch`."""
            wrapped = self.dataloader
            if hasattr(wrapped, "set_epoch"):
                wrapped.set_epoch(epoch)


class DataLoaderDispatcher(DataLoaderAdapter, DataLoaderStateMixin):
    """
    Subclass of `DataLoaderAdapter` that will iterate and preprocess on process 0 only, then dispatch on each process
    their part of the batch.

    Args:
        dataset:
            The dataset to use to build this dataloader (forwarded to `DataLoaderAdapter`).
        split_batches (`bool`, *optional*, defaults to `False`):
            Whether the resulting `DataLoader` should split the batches of the original data loader across devices or
            yield full batches (in which case it will yield batches starting at the `process_index`-th and advancing of
            `num_processes` batches at each iteration). Another way to see this is that the observed batch size will be
            the same as the initial `dataloader` if this option is set to `True`, the batch size of the initial
            `dataloader` multiplied by `num_processes` otherwise. Setting this option to `True` requires that the batch
            size of the `dataloader` is a round multiple of the number of processes.
        skip_batches (`int`, *optional*, defaults to 0):
            The number of batches to skip at the beginning of an iteration.
        use_stateful_dataloader (`bool`, *optional*, defaults to `False`):
            Whether to have this class adapt `StatefulDataLoader` from `torchdata` instead of the regular `DataLoader`.
        _drop_last (`bool`, *optional*, defaults to `False`):
            When `False`, a final batch that cannot be split evenly across processes is completed with samples taken
            from the first batch instead of being dropped.
        _non_blocking (`bool`, *optional*, defaults to `False`):
            Passed as `non_blocking` to `send_to_device` when moving batches to the device.
        slice_fn (`Callable`, *optional*):
            Function used to slice a batch for each process. Defaults to `slice_tensors`.
        torch_device_mesh (*optional*):
            Device mesh. It is only used when it has a `"tp"` dimension, in which case the same batch is dispatched to
            all ranks. Combining `"tp"` with `"dp"` or `"fsdp"` is not supported in dispatch mode.
        **kwargs (additional keyword arguments, *optional*):
            All other keyword arguments, forwarded to `DataLoaderAdapter`.

    Raises:
        ValueError: If the device mesh combines a `"tp"` dimension with a `"dp"` or `"fsdp"` dimension.

    **Available attributes:**

        - **total_batch_size** (`int`) -- Total batch size of the dataloader across all processes.
            Equal to the original batch size when `split_batches=True`; otherwise the original batch size * the total
            number of processes

        - **total_dataset_length** (`int`) -- Total length of the inner dataset across all processes.
    """

    def __init__(
        self,
        dataset,
        split_batches: bool = False,
        skip_batches=0,
        use_stateful_dataloader=False,
        _drop_last: bool = False,
        _non_blocking: bool = False,
        slice_fn=None,
        torch_device_mesh=None,
        **kwargs,
    ):
        shuffle = False
        from torch.utils.data.datapipes.iter.combinatorics import ShufflerIterDataPipe

        # We need to save the shuffling state of the DataPipe
        if isinstance(dataset, ShufflerIterDataPipe):
            shuffle = dataset._shuffle_enabled
        super().__init__(dataset, use_stateful_dataloader=use_stateful_dataloader, **kwargs)
        self.split_batches = split_batches
        if shuffle:
            # Re-apply the shuffle setting saved above now that the base dataloader has been built.
            torch.utils.data.graph_settings.apply_shuffle_settings(dataset, shuffle=shuffle)

        self.gradient_state = GradientState()
        self.state = PartialState()
        self._drop_last = _drop_last
        self._non_blocking = _non_blocking
        self.skip_batches = skip_batches
        self.torch_device_mesh = torch_device_mesh

        self.slice_fn = slice_tensors if slice_fn is None else slice_fn
        self.iteration = 0

        # if a device mesh is provided extract each dimension (dp, fsdp, tp)
        # device mesh may hold any number of dimensions, however,
        # below code is for targeted support for dp, fsdp and tp

        # device mesh will be used only if there is tp involved
        # or any multi-dimensional parallelism involving tp
        # (dp, tp) (fsdp, tp) (dp, fsdp, tp)
        # otherwise the default behaviour not using device mesh should be sufficient
        # since multi dimensional parallelism devoid of tp would anyway need
        # different batches for each process irrespective of dp or fsdp
        self.submesh_tp = None
        self.submesh_dp = None
        self.submesh_fsdp = None
        if self.torch_device_mesh and "tp" in self.torch_device_mesh.mesh_dim_names:
            self.submesh_tp = self.torch_device_mesh["tp"]
            if "dp" in self.torch_device_mesh.mesh_dim_names:
                self.submesh_dp = self.torch_device_mesh["dp"]
            if "fsdp" in self.torch_device_mesh.mesh_dim_names:
                self.submesh_fsdp = self.torch_device_mesh["fsdp"]
        if self.submesh_tp and (self.submesh_dp or self.submesh_fsdp):
            raise ValueError("TP + (DP/FSDP) is not yet supported in dispatch mode")

    def _fetch_batches(self, iterator):
        """
        Fetch the next global batch on process 0 and share its description with every process.

        Args:
            iterator: Iterator over the base dataloader. It is only advanced on process 0.

        Returns:
            `tuple`: `(batch, batch_info)`. `batch` is the batch to dispatch on process 0 and `None` on the other
            processes. `batch_info` is a two-item list `[data_structure, stop_iteration]` that is identical on every
            process after the broadcast.

        Raises:
            RuntimeError: If the fetched batches cannot be concatenated.
        """
        batches, batch = None, None
        # On process 0, we gather the batch to dispatch.
        if self.state.process_index == 0:
            # Procedure to support TP only is simpler
            # since we want to dispatch the same batch of samples across all ranks
            # this removes complexity of handling multiple tp rank groups when TP + DP
            # combination is involved.

            try:
                # for TP case avoid using split_batches
                # since it would mean that the dataloader should be spilling out
                # duplicates of batches.
                if self.split_batches:
                    # One batch of the main iterator is dispatched and split.
                    if self.submesh_tp:
                        logger.warning(
                            "Use of split_batches for TP would need the dataloader to produce duplicate batches, "
                            "otherwise, use dispatch_batches=True instead."
                        )
                    self._update_state_dict()
                    batch = next(iterator)
                else:
                    # num_processes batches of the main iterator are concatenated then dispatched and split.
                    # We add the batches one by one so we have the remainder available when drop_last=False.
                    batches = []
                    if self.submesh_tp:
                        # when tp, extract single batch and then replicate
                        self._update_state_dict()
                        batch = next(iterator)
                        batches = [batch] * self.state.num_processes
                    else:
                        for _ in range(self.state.num_processes):
                            self._update_state_dict()
                            batches.append(next(iterator))
                    try:
                        batch = concatenate(batches, dim=0)
                    except RuntimeError as e:
                        raise RuntimeError(
                            "You can't use batches of different size with `dispatch_batches=True` or when using an "
                            "`IterableDataset`. Either pass `dispatch_batches=False` and have each process fetch its "
                            "own batch or pass `split_batches=True`. By doing so, the main process will fetch a full "
                            "batch and slice it into `num_processes` batches for each process."
                        ) from e
                # In both cases, we need to get the structure of the batch that we will broadcast on other
                # processes to initialize the tensors with the right shape.
                # data_structure, stop_iteration
                batch_info = [get_data_structure(batch), False]
            except StopIteration:
                batch_info = [None, True]
        else:
            batch_info = [None, self._stop_iteration]
        # This is inplace, so after this instruction, every process has the same `batch_info` as process 0.
        broadcast_object_list(batch_info)
        self._stop_iteration = batch_info[1]
        if self._stop_iteration:
            # If drop_last is False and split_batches is False, we may have a remainder to take care of.
            if not self.split_batches and not self._drop_last:
                if self.state.process_index == 0 and len(batches) > 0:
                    # Fewer than `num_processes` batches were left: dispatch them as one last batch.
                    batch = concatenate(batches, dim=0)
                    batch_info = [get_data_structure(batch), False]
                else:
                    batch_info = [None, True]
                broadcast_object_list(batch_info)
        return batch, batch_info

    def __iter__(self):
        """
        Yield, on each process, its slice of the batches fetched on process 0.

        Batches are fetched one step ahead so that the end of the dataloader is known (`end_of_dataloader` and
        `remainder` are set) before the last batch is yielded. The first `skip_batches` batches are processed but
        not yielded.

        Raises:
            ValueError: If the batch to dispatch is `None`.
        """
        self.begin()
        self.set_epoch(self.iteration)
        main_iterator = None
        if is_torch_version(">=", "2.0.1"):
            # NOTE PyTorch DataLoader adds forward compatibilities for DataPipes, which broadcasts
            # shared seed to all dist processes. Thus, we need to create iterator for all dist processes.
            # But, we only iterate through the DataLoader on process 0.
            main_iterator = self.base_dataloader.__iter__()
        elif self.state.process_index == 0:
            main_iterator = self.base_dataloader.__iter__()
        stop_iteration = False
        self._stop_iteration = False
        first_batch = None
        next_batch, next_batch_info = self._fetch_batches(main_iterator)
        batch_index = 0
        while not stop_iteration:
            batch, batch_info = next_batch, next_batch_info

            if self.state.process_index != 0:
                # Initialize tensors on other processes than process 0.
                batch = initialize_tensors(batch_info[0])
            batch = send_to_device(batch, self.state.device, non_blocking=self._non_blocking)
            # Broadcast the batch before splitting it.
            batch = broadcast(batch, from_process=0)

            # Validate the batch before it is first used, so that an empty batch is reported with this error
            # rather than with whatever `slice_fn` does when handed `None`.
            if batch is None:
                raise ValueError(
                    f"Batch does not contain any data (`{batch}`). At the end of all iterable data available before expected stop iteration."
                )

            if not self._drop_last and first_batch is None:
                # We keep at least num processes elements of the first batch to be able to complete the last batch
                first_batch = self.slice_fn(
                    batch,
                    slice(0, self.state.num_processes),
                    process_index=self.state.process_index,
                    num_processes=self.state.num_processes,
                )

            observed_batch_size = find_batch_size(batch)
            batch_size = observed_batch_size // self.state.num_processes

            stop_iteration = self._stop_iteration
            if not stop_iteration:
                # We may still be at the end of the dataloader without knowing it yet: if there is nothing left in
                # the dataloader since the number of batches is a round multiple of the number of processes.
                next_batch, next_batch_info = self._fetch_batches(main_iterator)
                # next_batch_info[0] is None when there are no more batches, otherwise we still need to process them.
                if self._stop_iteration and next_batch_info[0] is None:
                    stop_iteration = True

            if not self._drop_last and stop_iteration and observed_batch_size % self.state.num_processes != 0:
                # If the last batch is not complete, let's add the first batch to it.
                batch = concatenate([batch, first_batch], dim=0)
                # Batch size computation above is wrong, it's off by 1 so we fix it.
                batch_size += 1

            # Each process takes the contiguous chunk of `batch_size` samples at its own index.
            data_slice = slice(self.state.process_index * batch_size, (self.state.process_index + 1) * batch_size)
            batch = self.slice_fn(
                batch,
                data_slice,
                process_index=self.state.process_index,
                num_processes=self.state.num_processes,
            )

            if stop_iteration:
                self.end_of_dataloader = True
                self._update_state_dict()
                # Size of the last global batch before it was completed with samples of the first batch.
                self.remainder = observed_batch_size
            if batch_index >= self.skip_batches:
                yield batch
            batch_index += 1
        self.iteration += 1
        self.end()

    def set_epoch(self, epoch: int):
        """
        Record `epoch` as the current iteration and forward it to the sampler, or else to the dataset, when they
        support `set_epoch`.
        """
        # In case it is manually passed in, the user can set it to what they like
        if self.iteration != epoch:
            self.iteration = epoch
        if hasattr(self.batch_sampler, "sampler") and hasattr(self.batch_sampler.sampler, "set_epoch"):
            self.batch_sampler.sampler.set_epoch(epoch)
        elif hasattr(self.dataset, "set_epoch"):
            self.dataset.set_epoch(epoch)

    def __len__(self):
        """
        Number of batches yielded on each process.

        Equal to the length of the base dataloader when `split_batches=True`; otherwise that length divided by the
        number of processes, rounded down when `_drop_last` is set and rounded up otherwise.
        """
        whole_length = len(self.base_dataloader)
        if self.split_batches:
            return whole_length
        elif self._drop_last:
            return whole_length // self.state.num_processes
        else:
            return math.ceil(whole_length / self.state.num_processes)

    def __reduce__(self):
        """
        Define the `__reduce__` method to ensure a `DataLoaderDispatcher` can be pickled and unpickled. This needs to
        be explicitly defined since default pickling behavior is broken by `DataLoaderAdapter` messing with its
        `__class__` member.
        """
        args = super().__reduce__()
        return (DataLoaderDispatcher, *args[1:])

    @property
    def total_batch_size(self):
        """
        Total batch size across all processes, computed from the dataset's `batch_size` and `num_processes`.
        """
        # NOTE(review): unlike `DataLoaderShard.total_batch_size`, which reads these values from the batch sampler,
        # this reads `batch_size` / `num_processes` from `self.dataset` -- confirm the dataset is expected to
        # expose both attributes.
        return (
            self.dataset.batch_size if self.split_batches else (self.dataset.batch_size * self.dataset.num_processes)
        )

    @property
    def total_dataset_length(self):
        """Length of the dataset."""
        return len(self.dataset)

    def get_sampler(self):
        """Return the sampler associated to this dataloader (delegates to the module-level `get_sampler`)."""
        return get_sampler(self)

    def set_sampler(self, sampler):
        """
        Install `sampler` as the index sampler used by this dataloader.
        """
        sampler_is_batch_sampler = isinstance(self.sampler, BatchSampler)
        if sampler_is_batch_sampler:
            # The dataloader's own `sampler` is a batch sampler: replace the sampler it wraps.
            self.sampler.sampler = sampler
        else:
            self.batch_sampler.sampler = sampler
            # Keep a wrapped batch sampler in sync as well.
            if hasattr(self.batch_sampler, "batch_sampler"):
                self.batch_sampler.batch_sampler.sampler = sampler


def get_sampler(dataloader):
    """
    Get the sampler associated to the dataloader

    Args:
        dataloader (`torch.utils.data.dataloader.DataLoader`):
            The data loader whose sampler should be retrieved.
    Returns:
        `torch.utils.data.Sampler`: The sampler associated to the dataloader, or `None` when the object holding it
        has no `sampler` attribute.
    """
    # When the dataloader's `sampler` is itself a `BatchSampler`, the index sampler is the one it wraps;
    # otherwise it is the one wrapped by the dataloader's `batch_sampler`.
    holder = dataloader.sampler if isinstance(dataloader.sampler, BatchSampler) else dataloader.batch_sampler
    return getattr(holder, "sampler", None)


def prepare_data_loader(
    dataloader: DataLoader,
    device: Optional[torch.device] = None,
    num_processes: Optional[int] = None,
    process_index: Optional[int] = None,
    split_batches: bool = False,
    put_on_device: bool = False,
    rng_types: Optional[list[Union[str, RNGType]]] = None,
    dispatch_batches: Optional[bool] = None,
    even_batches: bool = True,
    slice_fn_for_dispatch: Optional[Callable] = None,
    use_seedable_sampler: bool = False,
    data_seed: Optional[int] = None,
    non_blocking: bool = False,
    use_stateful_dataloader: bool = False,
    torch_device_mesh=None,
) -> DataLoader:
    """
    Wraps a PyTorch `DataLoader` to generate batches for one of the processes only.

    Depending on the value of the `drop_last` attribute of the `dataloader` passed, it will either stop the iteration
    at the first batch that would be too small / not present on all processes or loop with indices from the beginning.

    Args:
        dataloader (`torch.utils.data.dataloader.DataLoader`):
            The data loader to split across several devices.
        device (`torch.device`):
            The target device for the returned `DataLoader`.
        num_processes (`int`, *optional*):
            The number of processes running concurrently. Will default to the value given by [`~state.PartialState`].
        process_index (`int`, *optional*):
            The index of the current process. Will default to the value given by [`~state.PartialState`].
        split_batches (`bool`, *optional*, defaults to `False`):
            Whether the resulting `DataLoader` should split the batches of the original data loader across devices or
            yield full batches (in which case it will yield batches starting at the `process_index`-th and advancing of
            `num_processes` batches at each iteration).

            Another way to see this is that the observed batch size will be the same as the initial `dataloader` if
            this option is set to `True`, the batch size of the initial `dataloader` multiplied by `num_processes`
            otherwise.

            Setting this option to `True` requires that the batch size of the `dataloader` is a round multiple of
            the number of processes.
        put_on_device (`bool`, *optional*, defaults to `False`):
            Whether or not to put the batches on `device` (only works if the batches are nested list, tuples or
            dictionaries of tensors).
        rng_types (list of `str` or [`~utils.RNGType`]):
            The list of random number generators to synchronize at the beginning of each iteration. Should be one or
            several of:

            - `"torch"`: the base torch random number generator
            - `"cuda"`: the CUDA random number generator (GPU only)
            - `"xla"`: the XLA random number generator (TPU only)
            - `"generator"`: the `torch.Generator` of the sampler (or batch sampler if there is no sampler in your
              dataloader) or of the iterable dataset (if it exists) if the underlying dataset is of that type.

        dispatch_batches (`bool`, *optional*):
            If set to `True`, the dataloader prepared is only iterated through on the main process and then the batches
            are split and broadcast to each process. Will default to `True` when the underlying dataset is an
            `IterableDataset`, `False` otherwise.
        even_batches (`bool`, *optional*, defaults to `True`):
            If set to `True`, in cases where the total batch size across all processes does not exactly divide the
            dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among
            all workers.
        slice_fn_for_dispatch (`Callable`, *optional*):
            If passed, this function will be used to slice tensors across `num_processes`. Will default to
            [`~utils.slice_tensors`]. This argument is used only when `dispatch_batches` is set to `True` and will be
            ignored otherwise.
        use_seedable_sampler (`bool`, *optional*, defaults to `False`):
            Whether to use the [`~data_loader.SeedableRandomSampler`] instead of a `RandomSampler` for better
            reproducibility. Comes at a cost of potentially different performances due to different shuffling
            algorithms but ensures results will be the *exact* same. Should be paired with `set_seed()` at every
            `self.set_epoch`
        data_seed (`int`, *optional*, defaults to `None`):
            The seed to use for the underlying generator when using `use_seedable_sampler`. If `None`, the generator
            will use the current default seed from torch.
        non_blocking (`bool`, *optional*, defaults to `False`):
            If set to `True`, dataloader will utilize non-blocking host-to-device transfers. If the dataloader has
            `pin_memory` set to `True`, this will help to increase overlap between data transfer and computations.
        use_stateful_dataloader (`bool`, *optional*, defaults to `False`):
            If set to `True`, the dataloader prepared by the Accelerator will be backed by
            [torchdata.StatefulDataLoader](https://github.com/pytorch/data/tree/main/torchdata/stateful_dataloader).
            This requires `torchdata` version 0.8.0 or higher that supports StatefulDataLoader to be installed.
        torch_device_mesh (`torch.distributed.DeviceMesh`, *optional*, defaults to `None`):
            PyTorch device mesh.


    Returns:
        `torch.utils.data.dataloader.DataLoader`: A new data loader that will yield the portion of the batches

    <Tip warning={true}>

    `BatchSampler`s with varying batch sizes are not enabled by default. To enable this behaviour, set `even_batches`
    equal to `False`

    </Tip>
    """
    if dispatch_batches is None:
        if not put_on_device:
            dispatch_batches = False
        else:
            dispatch_batches = isinstance(dataloader.dataset, IterableDataset)

    if dispatch_batches and not put_on_device:
        raise ValueError("Using `dispatch_batches=True` requires `put_on_device=True`.")
    # Grab defaults from PartialState
    state = PartialState()
    if num_processes is None:
        num_processes = state.num_processes

    if process_index is None:
        process_index = state.process_index

    if torch_device_mesh:
        if state.distributed_type == DistributedType.DEEPSPEED:
            # In DeepSpeed, the optimizer sharing level in DP is determined by the config file.
            # Only considers "dp" and "tp".
            # Given a device mesh (dp, tp) = (2, 3):
            # - From the data parallel perspective, ranks should be structured as: 0 0 0 1 1 1
            # - Processes with the same DP rank will receive the same batch.
            submesh_tp_size = 1
            if "tp" in torch_device_mesh.mesh_dim_names:
                submesh_tp_size = torch_device_mesh["tp"].size()
            process_index = process_index // submesh_tp_size
            num_processes = num_processes // submesh_tp_size
        else:
            # when device mesh is used, specifically with TP
            # then there is need to update process_index and num_processes
            # to bring in the effect of generating same batch across TP ranks
            # and different batch across FSDP and DP ranks.
            # Example:
            # if device mesh is (dp,fsdp,tp) = (2, 2, 3)
            # ranks would range from 0...11
            # from data angle ranks should look like 0 0 0 1 1 1 2 2 2 3 3 3
            # processes with same ranks/ids would receive the same batch
            # for CP the same as TP applies
            submesh_fsdp_size = 1
            submesh_dp_size = 1
            submesh_tp_size = 1
            submesh_cp_size = 1
            if "tp" in torch_device_mesh.mesh_dim_names:
                submesh_tp_size = torch_device_mesh["tp"].size()
            if "cp" in torch_device_mesh.mesh_dim_names:
                submesh_cp_size = torch_device_mesh["cp"].size()
            if "dp_replicate" in torch_device_mesh.mesh_dim_names:
                submesh_dp_size = torch_device_mesh["dp_replicate"].size()
            if "dp_shard" in torch_device_mesh.mesh_dim_names:
                submesh_fsdp_size = torch_device_mesh["dp_shard"].size()
            process_index = process_index // (submesh_tp_size * submesh_cp_size)
            num_processes = submesh_fsdp_size * submesh_dp_size

    # Sanity check
    if split_batches:
        if dataloader.batch_size is not None:
            batch_size_for_check = dataloader.batch_size
        else:
            # For custom batch_sampler
            if hasattr(dataloader.batch_sampler, "batch_size"):
                batch_size_for_check = dataloader.batch_sampler.batch_size
            else:
                raise ValueError(
                    "In order to use `split_batches==True` you must have a `batch_size` attribute either in the passed "
                    "`dataloader` or `dataloader.batch_sampler` objects, and it has to return a natural number. "
                    "Your `dataloader.batch_size` is None and `dataloader.batch_sampler` "
                    f"(`{type(dataloader.batch_sampler)}`) does not have the `batch_size` attribute set."
                )

        if batch_size_for_check > 1 and batch_size_for_check % num_processes != 0:
            raise ValueError(
                f"To use a `DataLoader` in `split_batches` mode, the batch size ({dataloader.batch_size}) "
                f"needs to be a round multiple of the number of processes ({num_processes})."
            )

    new_dataset = dataloader.dataset
    # Iterable dataset doesn't like batch_sampler, but data_loader creates a default one for it
    new_batch_sampler = dataloader.batch_sampler if not isinstance(new_dataset, IterableDataset) else None
    sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler)
    synchronized_generator = None

    sampler = get_sampler(dataloader)
    if isinstance(sampler, RandomSampler) and use_seedable_sampler:
        # When iterating through the dataloader during distributed processes
        # we want to ensure that on each process we are iterating through the same
        # samples in the same order if a seed is set. This requires a tweak
        # to the `torch.utils.data.RandomSampler` class (if used).
        sampler = SeedableRandomSampler(
            data_source=sampler.data_source,
            replacement=sampler.replacement,
            num_samples=sampler._num_samples,
            generator=getattr(
                sampler,
                "generator",
                torch.Generator(device=torch.get_default_device() if hasattr(torch, "get_default_device") else "cpu"),
            ),
            data_seed=data_seed,
        )

    if isinstance(dataloader.sampler, RandomSampler) and state.distributed_type == DistributedType.XLA:
        # isinstance(dataloader.sampler, RandomSampler) indicates the original dataloader has `shuffle` enabled.
        generator = torch.Generator(
            device=torch.get_default_device() if hasattr(torch, "get_default_device") else "cpu"
        )
        seed = int(torch.empty((), dtype=torch.int64).random_().item())
        generator.manual_seed(seed)
        dataloader.generator = generator
        dataloader.sampler.generator = generator
    # No change if no multiprocess
    if (num_processes != 1 or state.distributed_type == DistributedType.MEGATRON_LM) and not dispatch_batches:
        if is_datasets_available():
            from datasets import IterableDataset as DatasetsIterableDataset
        if (
            is_datasets_available()
            and isinstance(new_dataset, DatasetsIterableDataset)
            and not split_batches
            and new_dataset.n_shards >= num_processes
        ):
            new_dataset = new_dataset.shard(num_shards=num_processes, index=process_index)
        elif isinstance(new_dataset, IterableDataset):
            if getattr(dataloader.dataset, "generator", None) is not None:
                synchronized_generator = dataloader.dataset.generator
            new_dataset = IterableDatasetShard(
                new_dataset,
                batch_size=dataloader.batch_size,
                drop_last=dataloader.drop_last,
                num_processes=num_processes,
                process_index=process_index,
                split_batches=split_batches,
            )
        else:
            if not use_seedable_sampler and hasattr(sampler, "generator"):
                if sampler.generator is None:
                    sampler.generator = torch.Generator(
                        device=torch.get_default_device() if hasattr(torch, "get_default_device") else "cpu"
                    )
                    seed = int(torch.empty((), dtype=torch.int64).random_().item())
                    sampler.generator.manual_seed(seed)
                synchronized_generator = sampler.generator
            batch_sampler = dataloader.sampler if sampler_is_batch_sampler else dataloader.batch_sampler
            new_batch_sampler = BatchSamplerShard(
                batch_sampler,
                num_processes=num_processes,
                process_index=process_index,
                split_batches=split_batches,
                even_batches=even_batches,
            )

    # We ignore all of those since they are all dealt with by our new_batch_sampler
    ignore_kwargs = [
        "batch_size",
        "shuffle",
        "sampler",
        "batch_sampler",
        "drop_last",
    ]

    if rng_types is not None and synchronized_generator is None and "generator" in rng_types:
        rng_types.remove("generator")

    kwargs = {
        k: getattr(dataloader, k, _PYTORCH_DATALOADER_KWARGS[k])
        for k in _PYTORCH_DATALOADER_KWARGS
        if k not in ignore_kwargs
    }

    # Need to provide batch_size as batch_sampler is None for Iterable dataset
    if new_batch_sampler is None:
        kwargs["drop_last"] = dataloader.drop_last
        kwargs["batch_size"] = (
            dataloader.batch_size // num_processes if split_batches and not dispatch_batches else dataloader.batch_size
        )
    if dispatch_batches:
        kwargs.pop("generator")
        dataloader = DataLoaderDispatcher(
            new_dataset,
            split_batches=split_batches,
            batch_sampler=new_batch_sampler,
            _drop_last=dataloader.drop_last,
            _non_blocking=non_blocking,
            slice_fn=slice_fn_for_dispatch,
            use_stateful_dataloader=use_stateful_dataloader,
            torch_device_mesh=torch_device_mesh,
            **kwargs,
        )
    elif sampler_is_batch_sampler:
        dataloader = DataLoaderShard(
            new_dataset,
            device=device if put_on_device and state.distributed_type != DistributedType.XLA else None,
            sampler=new_batch_sampler,
            batch_size=dataloader.batch_size,
            rng_types=rng_types,
            _drop_last=dataloader.drop_last,
            _non_blocking=non_blocking,
            synchronized_generator=synchronized_generator,
            use_stateful_dataloader=use_stateful_dataloader,
            **kwargs,
        )
    else:
        dataloader = DataLoaderShard(
            new_dataset,
            device=device if put_on_device and state.distributed_type != DistributedType.XLA else None,
            batch_sampler=new_batch_sampler,
            rng_types=rng_types,
            synchronized_generator=synchronized_generator,
            _drop_last=dataloader.drop_last,
            _non_blocking=non_blocking,
            use_stateful_dataloader=use_stateful_dataloader,
            **kwargs,
        )

    if isinstance(sampler, SeedableRandomSampler) and use_seedable_sampler:
        dataloader.set_sampler(sampler)
    if state.distributed_type == DistributedType.XLA:
        return MpDeviceLoaderWrapper(dataloader, device)
    return dataloader


class SkipBatchSampler(BatchSampler):
    """
    A `torch.utils.data.BatchSampler` that skips the first `n` batches of another `torch.utils.data.BatchSampler`.
    Should not be used if the original dataloader is a `StatefulDataLoader`.

    Args:
        batch_sampler:
            The wrapped batch sampler. It is iterated from the start on every `__iter__` call and must support
            `len()` for `__len__`/`total_length` to work.
        skip_batches (`int`, *optional*, defaults to 0):
            The number of leading batches to drop.
    """

    def __init__(self, batch_sampler, skip_batches=0):
        # NOTE: the parent `BatchSampler.__init__` is intentionally not called here; this class only delegates to
        # the wrapped sampler.
        self.batch_sampler = batch_sampler
        self.skip_batches = skip_batches

    def __iter__(self):
        # The skipped batches are still produced by the wrapped sampler; they are simply not yielded.
        for index, samples in enumerate(self.batch_sampler):
            if index >= self.skip_batches:
                yield samples

    @property
    def total_length(self):
        """Length of the wrapped batch sampler, i.e. including the skipped batches."""
        return len(self.batch_sampler)

    def __len__(self):
        # Skipping more batches than are available yields nothing when iterating, so report 0 rather than a
        # negative number (`len()` raises `ValueError` when `__len__` returns a negative value).
        return max(len(self.batch_sampler) - self.skip_batches, 0)


class SkipDataLoader(DataLoaderAdapter, DataLoaderStateMixin):
    """
    Subclass of a PyTorch `DataLoader` that will skip the first batches. Generally it's preferable to use
    `skip_first_batches`/`torchdata.StatefulDataLoader` instead of this class.

    Args:
        dataset (`torch.utils.data.dataset.Dataset`):
            The dataset to use to build this dataloader.
        skip_batches (`int`, *optional*, defaults to 0):
            The number of batches to skip at the beginning.
        use_stateful_dataloader (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the `DataLoaderAdapter` constructor.
        kwargs:
            All other keyword arguments to pass to the regular `DataLoader` initialization.
    """

    def __init__(self, dataset, skip_batches=0, use_stateful_dataloader=False, **kwargs):
        super().__init__(dataset, use_stateful_dataloader=use_stateful_dataloader, **kwargs)
        self.skip_batches = skip_batches
        self.gradient_state = GradientState()

    def __iter__(self):
        # `begin()` / `end()` bracket the iteration; they are defined outside this block (presumably on
        # `DataLoaderStateMixin` -- confirm there).
        self.begin()
        for index, batch in enumerate(self.base_dataloader.__iter__()):
            # The first `skip_batches` batches are still fetched from the base dataloader, just not yielded.
            if index >= self.skip_batches:
                self._update_state_dict()
                yield batch
        self.end()

    def __len__(self):
        # Skipping more batches than the base dataloader holds yields nothing when iterating, so report 0 rather
        # than a negative number (`len()` raises `ValueError` when `__len__` returns a negative value).
        return max(len(self.base_dataloader) - self.skip_batches, 0)

    def __reduce__(self):
        """
        Define the `__reduce__` method to ensure a `SkipDataLoader` can be pickled and unpickled. This needs to be
        explicitly defined since default pickling behavior is broken by `DataLoaderAdapter` messing with its
        `__class__` member.
        """
        args = super().__reduce__()
        # Keep everything the parent produced, but make sure the reconstructing callable is `SkipDataLoader`.
        return (SkipDataLoader, *args[1:])


def skip_first_batches(dataloader, num_batches=0):
    """
    Creates a `torch.utils.data.DataLoader` that will efficiently skip the first `num_batches`. Should not be used if
    the original dataloader is a `StatefulDataLoader`.

    For map-style datasets the existing batch sampler is wrapped in a `SkipBatchSampler`. For an `IterableDataset`
    there is no batch sampler to wrap, so the number of batches to skip is handed to the rebuilt dataloader instead
    (`skip_batches=...`, or a `SkipDataLoader` for a plain dataloader).

    Args:
        dataloader:
            The dataloader to rebuild. Under XLA it is expected to expose `.device` and `.dataloader`, which are
            unwrapped here and re-wrapped in a `MpDeviceLoaderWrapper` before returning.
        num_batches (`int`, *optional*, defaults to 0):
            The number of batches to skip at the beginning.

    Returns:
        A new dataloader of the same flavour (`DataLoaderDispatcher`, `DataLoaderShard`, or a plain
        `DataLoader`/`SkipDataLoader`) as the one passed in.
    """
    state = PartialState()
    running_on_xla = state.distributed_type == DistributedType.XLA
    if running_on_xla:
        # Unwrap the device loader; it is wrapped again at the very end.
        device = dataloader.device
        dataloader = dataloader.dataloader

    dataset = dataloader.dataset
    if isinstance(dataset, IterableDataset):
        sampler_is_batch_sampler = False
        new_batch_sampler = None
    else:
        sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler)
        wrapped_sampler = dataloader.sampler if sampler_is_batch_sampler else dataloader.batch_sampler
        new_batch_sampler = SkipBatchSampler(wrapped_sampler, skip_batches=num_batches)
    # Without a batch sampler to wrap, skipping has to happen inside the rebuilt dataloader itself.
    skip_manually = new_batch_sampler is None

    # These arguments are all covered by the new batch sampler, so they are not copied over.
    handled_by_sampler = ("batch_size", "shuffle", "sampler", "batch_sampler", "drop_last")
    kwargs = {}
    for key, default in _PYTORCH_DATALOADER_KWARGS.items():
        if key in handled_by_sampler:
            continue
        kwargs[key] = getattr(dataloader, key, default)

    if skip_manually:
        # An iterable dataset has no batch sampler, so batching options must be passed explicitly.
        kwargs["drop_last"] = dataloader.drop_last
        kwargs["batch_size"] = dataloader.batch_size

    if isinstance(dataloader, DataLoaderDispatcher):
        if skip_manually:
            kwargs["skip_batches"] = num_batches
        dataloader = DataLoaderDispatcher(
            dataset,
            split_batches=dataloader.split_batches,
            batch_sampler=new_batch_sampler,
            _drop_last=dataloader._drop_last,
            **kwargs,
        )
    elif isinstance(dataloader, DataLoaderShard):
        if skip_manually:
            kwargs["skip_batches"] = num_batches
        elif sampler_is_batch_sampler:
            # The original loader used a batch sampler as its `sampler`; keep that arrangement.
            kwargs["sampler"] = new_batch_sampler
            kwargs["batch_size"] = dataloader.batch_size
        else:
            kwargs["batch_sampler"] = new_batch_sampler
        dataloader = DataLoaderShard(
            dataset,
            device=dataloader.device,
            rng_types=dataloader.rng_types,
            synchronized_generator=dataloader.synchronized_generator,
            **kwargs,
        )
    elif skip_manually:
        dataloader = SkipDataLoader(dataset, skip_batches=num_batches, **kwargs)
    else:
        dataloader = DataLoader(dataset, batch_sampler=new_batch_sampler, **kwargs)

    if running_on_xla:
        dataloader = MpDeviceLoaderWrapper(dataloader, device)

    return dataloader


================================================
FILE: src/accelerate/hooks.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
from collections.abc import Mapping
from typing import Optional, Union

import torch
import torch.nn as nn

from .state import PartialState
from .utils import (
    PrefixedDataset,
    find_device,
    named_module_tensors,
    send_to_device,
    set_module_tensor_to_device,
)
from .utils.imports import (
    is_mlu_available,
    is_musa_available,
    is_npu_available,
)
from .utils.memory import clear_device_cache
from .utils.modeling import get_non_persistent_buffers
from .utils.other import recursive_getattr


def _compiler_disable(fn):
    """
    Lazy version of `torch.compiler.disable` that avoids importing `torch._dynamo` at decoration time.
    `torch.compiler.disable` eagerly imports `torch._dynamo` which adds ~4s to import time.
    """

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        if not hasattr(wrapper, "_compiled_fn"):
            wrapper._compiled_fn = torch.compiler.disable(fn)
        return wrapper._compiled_fn(*args, **kwargs)

    return wrapper


# Names of instance-level attributes that `remove_hook_from_module` pops from a module's `__dict__` (described
# there as the warning hooks added by `dispatch_model`).
_accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu", "sdaa", "musa"]


class ModelHook:
    """
    Base class for hooks whose callbacks run right before and right after a model's forward method. Unlike the
    hooks PyTorch provides, these callbacks also receive the keyword arguments of the call.

    Every callback of this base class is a pass-through; subclasses override the ones they need.

    Class attribute:
    - **no_grad** (`bool`, *optional*, defaults to `False`) -- Whether or not to execute the actual forward pass under
      the `torch.no_grad()` context manager.
    """

    no_grad = False

    def init_hook(self, module):
        """
        Called once, when the hook gets attached to `module`.

        Args:
            module (`torch.nn.Module`): The module this hook is being attached to.

        Returns:
            `torch.nn.Module`: The module to keep using (here, `module` unchanged).
        """
        return module

    def pre_forward(self, module, *args, **kwargs):
        """
        Called right before the forward method of the model runs.

        Args:
            module (`torch.nn.Module`): The module whose forward pass is about to be executed.
            args (`Tuple[Any]`): The positional arguments passed to the module.
            kwargs (`Dict[str, Any]`): The keyword arguments passed to the module.

        Returns:
            `Tuple[Tuple[Any], Dict[str, Any]]`: A tuple with the treated `args` and `kwargs`.
        """
        return args, kwargs

    def post_forward(self, module, output):
        """
        Called right after the forward method of the model has run.

        Args:
            module (`torch.nn.Module`): The module whose forward pass has just been executed.
            output (`Any`): The output of the module.

        Returns:
            `Any`: The processed `output`.
        """
        return output

    def detach_hook(self, module):
        """
        Called when the hook gets detached from `module`.

        Args:
            module (`torch.nn.Module`): The module this hook is being detached from.

        Returns:
            `torch.nn.Module`: The module to keep using (here, `module` unchanged).
        """
        return module


class SequentialHook(ModelHook):
    """
    A composite hook: it holds several hooks and, for every event, calls each of them in the order they were given,
    feeding the result of one into the next.
    """

    def __init__(self, *hooks):
        # Kept as the tuple of positional arguments, in call order.
        self.hooks = hooks

    def init_hook(self, module):
        """Runs `init_hook` of every contained hook in order and returns the resulting module."""
        current = module
        for member in self.hooks:
            current = member.init_hook(current)
        return current

    @_compiler_disable
    def pre_forward(self, module, *args, **kwargs):
        """Threads `args`/`kwargs` through `pre_forward` of every contained hook in order."""
        for member in self.hooks:
            args, kwargs = member.pre_forward(module, *args, **kwargs)
        return args, kwargs

    @_compiler_disable
    def post_forward(self, module, output):
        """Threads `output` through `post_forward` of every contained hook in order."""
        result = output
        for member in self.hooks:
            result = member.post_forward(module, result)
        return result

    def detach_hook(self, module):
        """Runs `detach_hook` of every contained hook in order and returns the resulting module."""
        current = module
        for member in self.hooks:
            current = member.detach_hook(current)
        return current


def add_hook_to_module(module: nn.Module, hook: ModelHook, append: bool = False):
    """
    Attaches `hook` to `module` by replacing the module's `forward` with a wrapper that calls the hook's
    `pre_forward` before, and `post_forward` after, the original forward. Use `remove_hook_from_module` to undo this
    and restore the original `forward` method.

    <Tip warning={true}>

    If the module already contains a hook, this will replace it with the new hook passed by default. To chain two hooks
    together, pass `append=True`, so it chains the current and new hook into an instance of the `SequentialHook` class.

    </Tip>

    Args:
        module (`torch.nn.Module`):
            The module to attach a hook to.
        hook (`ModelHook`):
            The hook to attach.
        append (`bool`, *optional*, defaults to `False`):
            Whether the hook should be chained with an existing one (if module already contains a hook) or not.

    Returns:
        `torch.nn.Module`: The same module, with the hook attached (the module is modified in place, so the result can
        be discarded).
    """
    if append:
        previous_hook = getattr(module, "_hf_hook", None)
        if previous_hook is not None:
            # Detach the existing hook first, then re-attach it chained in front of the new one.
            remove_hook_from_module(module)
            hook = SequentialHook(previous_hook, hook)

    already_wrapped = hasattr(module, "_hf_hook") and hasattr(module, "_old_forward")
    if already_wrapped:
        # A hook is already in place: keep wrapping the forward that was saved back then.
        old_forward = module._old_forward
    else:
        old_forward = module.forward
        module._old_forward = old_forward

    module = hook.init_hook(module)
    module._hf_hook = hook

    def new_forward(module, *args, **kwargs):
        # The hook is looked up on the module at call time rather than captured from the enclosing scope.
        args, kwargs = module._hf_hook.pre_forward(module, *args, **kwargs)
        if not module._hf_hook.no_grad:
            output = module._old_forward(*args, **kwargs)
        else:
            with torch.no_grad():
                output = module._old_forward(*args, **kwargs)
        return module._hf_hook.post_forward(module, output)

    hooked_forward = functools.update_wrapper(functools.partial(new_forward, module), old_forward)

    # Overriding a GraphModuleImpl forward freezes the forward call and later modifications on the graph will fail.
    # Reference: https://pytorch.slack.com/archives/C3PDTEV8E/p1705929610405409
    forward_owner = module.__class__ if "GraphModuleImpl" in str(type(module)) else module
    forward_owner.forward = hooked_forward

    return module


def remove_hook_from_module(module: nn.Module, recurse=False):
    """
    Removes any hook attached to a module via `add_hook_to_module` and restores the saved original `forward`.

    Args:
        module (`torch.nn.Module`): The module to detach the hook from.
        recurse (`bool`, *optional*, defaults to `False`): Whether to also remove the hooks from all child modules,
            recursively.

    Returns:
        `torch.nn.Module`: The same module, with the hook detached (the module is modified in place, so the result can
        be discarded).
    """
    if hasattr(module, "_hf_hook"):
        # Let the hook clean up after itself before it is dropped.
        module._hf_hook.detach_hook(module)
        del module._hf_hook

    if hasattr(module, "_old_forward"):
        saved_forward = module._old_forward
        # Overriding a GraphModuleImpl forward freezes the forward call and later modifications on the graph will fail.
        # Reference: https://pytorch.slack.com/archives/C3PDTEV8E/p1705929610405409
        forward_owner = module.__class__ if "GraphModuleImpl" in str(type(module)) else module
        forward_owner.forward = saved_forward
        del module._old_forward

    # Remove accelerate added warning hooks from dispatch_model
    instance_attributes = module.__dict__
    for attribute_name in _accelerate_added_attributes:
        instance_attributes.pop(attribute_name, None)

    if recurse:
        for submodule in module.children():
            remove_hook_from_module(submodule, recurse=recurse)

    return module


class AlignDevicesHook(ModelHook):
    """
    A generic `ModelHook` that ensures inputs and model weights are on the same device for the forward pass of the
    associated module, potentially offloading the weights after the forward pass.

    Args:
        execution_device (`int`, `str` or `torch.device`, *optional*):
            The device on which inputs and model weights should be placed before the forward pass.
        offload (`bool`, *optional*, defaults to `False`):
            Whether or not the weights should be offloaded after the forward pass.
        io_same_device (`bool`, *optional*, defaults to `False`):
            Whether or not the output should be placed on the same device as the input was.
        weights_map (`Mapping[str, torch.Tensor]`, *optional*):
            When the model weights are offloaded, a (potentially lazy) map from param names to the tensor values.
            When offloading without a map, `init_hook` builds one from CPU copies of the module's tensors.
        offload_buffers (`bool`, *optional*, defaults to `False`):
            Whether or not to include the associated module's buffers when offloading.
        place_submodules (`bool`, *optional*, defaults to `False`):
            Whether to place the submodules on `execution_device` during the `init_hook` event.
        skip_keys (`str` or `List[str]`, *optional*):
            Keys to ignore when moving the keyword arguments and the outputs between devices (forwarded to
            `send_to_device`).
        tied_params_map (`Dict[int, Dict[torch.device, torch.Tensor]]`, *optional*):
            A map of data pointers to dictionaries of devices to already dispatched tied weights, used to avoid
            duplicating memory for tied weights already loaded on the target execution device.
    """

    def __init__(
        self,
        execution_device: Optional[Union[int, str, torch.device]] = None,
        offload: bool = False,
        io_same_device: bool = False,
        weights_map: Optional[Mapping] = None,
        offload_buffers: bool = False,
        place_submodules: bool = False,
        skip_keys: Optional[Union[str, list[str]]] = None,
        tied_params_map: Optional[dict[int, dict[torch.device, torch.Tensor]]] = None,
    ):
        self.execution_device = execution_device
        self.offload = offload
        self.io_same_device = io_same_device
        self.weights_map = weights_map
        self.offload_buffers = offload_buffers
        self.place_submodules = place_submodules
        self.skip_keys = skip_keys

        # Will contain the input device when `io_same_device=True`.
        self.input_device = None
        # NOTE(review): the two dicts below are not read by any method of this class.
        self.param_original_devices = {}
        self.buffer_original_devices = {}
        # Names of tensors whose data pointer was found in `tied_params_map` during `init_hook` (offload only).
        self.tied_params_names = set()

        # The hook pre_forward/post_forward need to have knowledge of this dictionary, as with offloading we want to avoid duplicating memory
        # for tied weights already loaded on the target execution device.
        self.tied_params_map = tied_params_map

    def __repr__(self):
        # `weights_map` and `tied_params_map` are left out of the representation.
        return (
            f"AlignDevicesHook(execution_device={self.execution_device}, offload={self.offload}, "
            f"io_same_device={self.io_same_device}, offload_buffers={self.offload_buffers}, "
            f"place_submodules={self.place_submodules}, skip_keys={repr(self.skip_keys)})"
        )

    def init_hook(self, module):
        """
        Prepares `module` when the hook is attached.

        Without offloading (and with an `execution_device` set), the module's tensors are moved to
        `execution_device`. With offloading, the original device of each tensor is recorded, a CPU `weights_map` is
        built if none was provided, and the tensors are then put on the `"meta"` device; buffers that are not
        offloaded are placed on `execution_device`.

        Args:
            module (`torch.nn.Module`): The module attached to this hook.

        Returns:
            `torch.nn.Module`: The same module.
        """
        # In case the AlignDevicesHook is on meta device, ignore tied weights as data_ptr() is then always zero.
        if self.execution_device == "meta" or self.execution_device == torch.device("meta"):
            self.tied_params_map = None

        if not self.offload and self.execution_device is not None:
            for name, _ in named_module_tensors(module, recurse=self.place_submodules):
                set_module_tensor_to_device(module, name, self.execution_device, tied_params_map=self.tied_params_map)
        elif self.offload:
            # Remembered so that `detach_hook` can put every tensor back where it came from.
            self.original_devices = {
                name: param.device for name, param in named_module_tensors(module, recurse=self.place_submodules)
            }
            if self.weights_map is None:
                # No external map was supplied: keep CPU copies of the tensors to reload them from.
                self.weights_map = {
                    name: param.to("cpu")
                    for name, param in named_module_tensors(
                        module, include_buffers=self.offload_buffers, recurse=self.place_submodules
                    )
                }
            for name, _ in named_module_tensors(
                module, include_buffers=self.offload_buffers, recurse=self.place_submodules, remove_non_persistent=True
            ):
                # When using disk offloading, we can not rely on `weights_map[name].data_ptr()` as the reference pointer,
                # as we have no guarantee that safetensors' `file.get_tensor()` will always give the same pointer.
                # As we have no reliable way to track the shared data pointer of tied weights in this case, we use tied_params_names: List[str]
                # to add on the fly pointers to `tied_params_map` in the pre_forward call.
                if (
                    self.tied_params_map is not None
                    and recursive_getattr(module, name).data_ptr() in self.tied_params_map
                ):
                    self.tied_params_names.add(name)

                set_module_tensor_to_device(module, name, "meta")

            if not self.offload_buffers and self.execution_device is not None:
                # Buffers are not offloaded: keep all of them on the execution device.
                for name, _ in module.named_buffers(recurse=self.place_submodules):
                    set_module_tensor_to_device(
                        module, name, self.execution_device, tied_params_map=self.tied_params_map
                    )
            elif self.offload_buffers and self.execution_device is not None:
                # Buffers are offloaded, except the non-persistent ones, which are placed on the execution device.
                for name in get_non_persistent_buffers(module, recurse=self.place_submodules):
                    set_module_tensor_to_device(
                        module, name, self.execution_device, tied_params_map=self.tied_params_map
                    )

        return module

    @_compiler_disable
    def pre_forward(self, module, *args, **kwargs):
        """
        Runs just before the module's forward pass.

        Records the device of the inputs when `io_same_device` is set, loads the offloaded tensors from
        `weights_map` onto `execution_device` when `offload` is set, and sends the inputs to `execution_device`.

        Args:
            module (`torch.nn.Module`): The module whose forward pass will be executed just after this event.
            args (`Tuple[Any]`): The positional arguments passed to the module.
            kwargs (`Dict[str, Any]`): The keyword arguments passed to the module.

        Returns:
            `Tuple[Tuple[Any], Dict[str, Any]]`: `args` and `kwargs` sent to `execution_device` (`skip_keys` is only
            applied to `kwargs`).
        """
        if self.io_same_device:
            self.input_device = find_device([args, kwargs])
        if self.offload:
            # (data pointer, device) pairs that `post_forward` must drop from `tied_params_map` again.
            self.tied_pointers_to_remove = set()

            for name, _ in named_module_tensors(
                module,
                include_buffers=self.offload_buffers,
                recurse=self.place_submodules,
                remove_non_persistent=True,
            ):
                fp16_statistics = None
                value = self.weights_map[name]
                # For an int8 weight with a matching "SCB" entry in the map, pass that entry along as
                # `fp16_statistics`.
                if "weight" in name and name.replace("weight", "SCB") in self.weights_map.keys():
                    if value.dtype == torch.int8:
                        fp16_statistics = self.weights_map[name.replace("weight", "SCB")]

                # In case we are using offloading with tied weights, we need to keep track of the offloaded weights
                # that are loaded on device at this point, as we will need to remove them as well from the dictionary
                # self.tied_params_map in order to allow to free memory.
                if name in self.tied_params_names and value.data_ptr() not in self.tied_params_map:
                    self.tied_params_map[value.data_ptr()] = {}

                if (
                    value is not None
                    and self.tied_params_map is not None
                    and value.data_ptr() in self.tied_params_map
                    and self.execution_device not in self.tied_params_map[value.data_ptr()]
                ):
                    self.tied_pointers_to_remove.add((value.data_ptr(), self.execution_device))

                set_module_tensor_to_device(
                    module,
                    name,
                    self.execution_device,
                    value=value,
                    fp16_statistics=fp16_statistics,
                    tied_params_map=self.tied_params_map,
                )

        return send_to_device(args, self.execution_device), send_to_device(
            kwargs, self.execution_device, skip_keys=self.skip_keys
        )

    @_compiler_disable
    def post_forward(self, module, output):
        """
        Runs just after the module's forward pass.

        When `offload` is set, puts the offloaded tensors back on the `"meta"` device and drops the tied-weight
        entries registered by `pre_forward`. When `io_same_device` is set and an input device was found, sends the
        output back to that device.

        Args:
            module (`torch.nn.Module`): The module whose forward pass has been executed just before this event.
            output (`Any`): The output of the module.

        Returns:
            `Any`: The processed `output`.
        """
        if self.offload:
            for name, _ in named_module_tensors(
                module,
                include_buffers=self.offload_buffers,
                recurse=self.place_submodules,
                remove_non_persistent=True,
            ):
                set_module_tensor_to_device(module, name, "meta")
                # Modules are matched by class name here, so no import of the class is needed.
                if type(module).__name__ == "Linear8bitLt":
                    module.state.SCB = None
                    module.state.CxB = None

            # We may have loaded tied weights into self.tied_params_map (avoiding to load them several times in e.g. submodules): remove them from
            # this dictionary to allow the garbage collector to do its job.
            # NOTE(review): `tied_pointers_to_remove` is only created in `pre_forward`, so this relies on
            # `pre_forward` having run first.
            for value_pointer, device in self.tied_pointers_to_remove:
                if isinstance(device, int):
                    # An integer device index is rewritten as an explicit backend device string when one of these
                    # backends is available -- presumably to match the keys stored in `tied_params_map`; confirm
                    # against `set_module_tensor_to_device`.
                    if is_npu_available():
                        device = f"npu:{device}"
                    elif is_mlu_available():
                        device = f"mlu:{device}"
                    elif is_musa_available():
                        device = f"musa:{device}"
                if device in self.tied_params_map[value_pointer]:
                    del self.tied_params_map[value_pointer][device]
            self.tied_pointers_to_remove = set()
        if self.io_same_device and self.input_device is not None:
            output = send_to_device(output, self.input_device, skip_keys=self.skip_keys)

        return output

    def detach_hook(self, module):
        """
        Runs when the hook is detached from `module`.

        When `offload` is set, every tensor whose original device (recorded in `init_hook`) was not `"meta"` is moved
        back to that device, using the value stored in `weights_map` when there is one.

        Args:
            module (`torch.nn.Module`): The module detached from this hook.

        Returns:
            `torch.nn.Module`: The same module.
        """
        if self.offload:
            for name, device in self.original_devices.items():
                if device != torch.device("meta"):
                    set_module_tensor_to_device(module, name, device, value=self.weights_map.get(name, None))
        return module


def attach_execution_device_hook(
    module: torch.nn.Module,
    execution_device: Union[int, str, torch.device],
    skip_keys: Optional[Union[str, list[str]]] = None,
    preload_module_classes: Optional[list[str]] = None,
    tied_params_map: Optional[dict[int, dict[torch.device, torch.Tensor]]] = None,
):
    """
    Walks `module` and its submodules recursively and attaches an `AlignDevicesHook` to each one that does not
    already carry a hook and has a non-empty state dict, so that they all run on the right execution device.

    Args:
        module (`torch.nn.Module`):
            The module where we want to attach the hooks.
        execution_device (`int`, `str` or `torch.device`):
            The device on which inputs and model weights should be placed before the forward pass.
        skip_keys (`str` or `List[str]`, *optional*):
            A list of keys to ignore when moving inputs or outputs between devices.
        preload_module_classes (`List[str]`, *optional*):
            A list of classes whose instances should load all their weights (even in the submodules) at the beginning
            of the forward. This should only be used for classes that have submodules which are registered but not
            called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
            `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
        tied_params_map (Optional[Dict[int, Dict[torch.device, torch.Tensor]]], *optional*, defaults to `None`):
            A map of data pointers to dictionaries of devices to already dispatched tied weights. For a given execution
            device, this parameter is useful to reuse the first available pointer of a shared weight for all others,
            instead of duplicating memory.
    """
    already_hooked = hasattr(module, "_hf_hook")
    if not already_hooked and len(module.state_dict()) > 0:
        device_hook = AlignDevicesHook(execution_device, skip_keys=skip_keys, tied_params_map=tied_params_map)
        add_hook_to_module(module, device_hook)

    # A preload module is hooked itself, but its children are not visited.
    if preload_module_classes is not None and module.__class__.__name__ in preload_module_classes:
        return

    # The same options are handed down unchanged to every child.
    inherited_options = {
        "skip_keys": skip_keys,
        "preload_module_classes": preload_module_classes,
        "tied_params_map": tied_params_map,
    }
    for submodule in module.children():
        attach_execution_device_hook(submodule, execution_device, **inherited_options)


def attach_align_device_hook(
    module: torch.nn.Module,
    execution_device: Optional[torch.device] = None,
    offload: bool = False,
    weights_map: Optional[Mapping] = None,
    offload_buffers: bool = False,
    module_name: str = "",
    skip_keys: Optional[Union[str, list[str]]] = None,
    preload_module_classes: Optional[list[str]] = None,
    tied_params_map: Optional[dict[int, dict[torch.device, torch.Tensor]]] = None,
):
    """
    Walks `module` and its descendants, appending an `AlignDevicesHook` to each module that directly owns at least
    one tensor (as reported by `named_module_tensors`) or that is a "preload" module while offloading.

    Args:
        module (`torch.nn.Module`):
            Root of the sub-tree to process.
        execution_device (`torch.device`, *optional*):
            Device forwarded to every `AlignDevicesHook` created here.
        offload (`bool`, *optional*, defaults to `False`):
            Forwarded to every hook; also required for a module to be treated as a preload module.
        weights_map (`Mapping[str, torch.Tensor]`, *optional*):
            Map from parameter names to tensor values. When given, each hook receives a `PrefixedDataset` view of it
            scoped to the dotted name of the module the hook is attached to.
        offload_buffers (`bool`, *optional*, defaults to `False`):
            Forwarded to every hook.
        module_name (`str`, *optional*, defaults to `""`):
            Dotted name of `module`; used to build the weights-map prefix and the names of its children.
        skip_keys (`str` or `List[str]`, *optional*):
            Forwarded to every hook.
        preload_module_classes (`List[str]`, *optional*):
            Class names of modules that must be handled as a whole: when offloading, such a module gets a single hook
            with `place_submodules=True` and its children are not visited.
        tied_params_map (Optional[Dict[int, Dict[torch.device, torch.Tensor]]], *optional*, defaults to `None`):
            Forwarded to every hook.
    """
    # A preload module is only special when offloading: it then takes care of all of its submodules at once.
    is_preload_root = (
        offload and preload_module_classes is not None and module.__class__.__name__ in preload_module_classes
    )
    owns_tensors = len(list(named_module_tensors(module))) > 0

    if owns_tensors or is_preload_root:
        scoped_weights = None
        if weights_map is not None:
            # Keys in the weights map are fully qualified, so narrow them down to this module's namespace.
            scoped_weights = PrefixedDataset(weights_map, f"{module_name}." if len(module_name) > 0 else "")
        add_hook_to_module(
            module,
            AlignDevicesHook(
                execution_device=execution_device,
                offload=offload,
                weights_map=scoped_weights,
                offload_buffers=offload_buffers,
                place_submodules=is_preload_root,
                skip_keys=skip_keys,
                tied_params_map=tied_params_map,
            ),
            append=True,
        )

    # The hook of a preload module already covers its descendants: do not go any deeper.
    if is_preload_root:
        return

    for short_name, submodule in module.named_children():
        qualified_name = f"{module_name}.{short_name}" if len(module_name) > 0 else short_name
        attach_align_device_hook(
            submodule,
            execution_device=execution_device,
            offload=offload,
            weights_map=weights_map,
            offload_buffers=offload_buffers,
            module_name=qualified_name,
            preload_module_classes=preload_module_classes,
            skip_keys=skip_keys,
            tied_params_map=tied_params_map,
        )


def remove_hook_from_submodules(module: nn.Module):
    """
    Strips the hooks from `module` and from every module nested below it.

    Modules are visited parent-first, children in registration order.

    Args:
        module (`torch.nn.Module`): Root of the module tree to clean up.
    """
    pending = [module]
    while pending:
        current = pending.pop()
        remove_hook_from_module(current)
        # Push children in reverse so that the first child is the next one popped (pre-order traversal).
        pending.extend(reversed(list(current.children())))


def attach_align_device_hook_on_blocks(
    module: nn.Module,
    execution_device: Optional[Union[torch.device, dict[str, torch.device]]] = None,
    offload: Union[bool, dict[str, bool]] = False,
    weights_map: Optional[Mapping] = None,
    offload_buffers: bool = False,
    module_name: str = "",
    skip_keys: Optional[Union[str, list[str]]] = None,
    preload_module_classes: Optional[list[str]] = None,
    tied_params_map: Optional[dict[int, dict[torch.device, torch.Tensor]]] = None,
):
    """
    Attaches `AlignDevicesHook` to all blocks of a given model as needed.

    Args:
        module (`torch.nn.Module`):
            The module where we want to attach the hooks.
        execution_device (`torch.device` or `Dict[str, torch.device]`, *optional*):
            The device on which inputs and model weights should be placed before the forward pass. It can be one device
            for the whole module, or a dictionary mapping module name to device.
        offload (`bool`, *optional*, defaults to `False`):
            Whether or not the weights should be offloaded after the forward pass. It can be one boolean for the whole
            module, or a dictionary mapping module name to boolean.
        weights_map (`Mapping[str, torch.Tensor]`, *optional*):
            When the model weights are offloaded, a (potentially lazy) map from param names to the tensor values.
        offload_buffers (`bool`, *optional*, defaults to `False`):
            Whether or not to include the associated module's buffers when offloading.
        module_name (`str`, *optional*, defaults to `""`):
            The name of the module.
        skip_keys (`str` or `List[str]`, *optional*):
            A list of keys to ignore when moving inputs or outputs between devices.
        preload_module_classes (`List[str]`, *optional*):
            A list of classes whose instances should load all their weights (even in the submodules) at the beginning
            of the forward. This should only be used for classes that have submodules which are registered but not
            called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
            `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
        tied_params_map (Optional[Dict[int, Dict[torch.device, torch.Tensor]]], *optional*, defaults to `None`):
            A map of data pointers to dictionaries of devices to already dispatched tied weights. For a given execution
            device, this parameter is useful to reuse the first available pointer of a shared weight for all others,
            instead of duplicating memory.
    """
    # If one device and one offload, we've got one hook.
    # Both arguments are tested against `Mapping` (not `dict`) so that any mapping type is routed to the per-module
    # logic below instead of being evaluated as a boolean here.
    if not isinstance(execution_device, Mapping) and not isinstance(offload, Mapping):
        if not offload:
            hook = AlignDevicesHook(
                execution_device=execution_device,
                io_same_device=True,
                skip_keys=skip_keys,
                place_submodules=True,
                tied_params_map=tied_params_map,
            )
            add_hook_to_module(module, hook)
        else:
            # `preload_module_classes` is forwarded so that this path honors it like the per-module offload path.
            attach_align_device_hook(
                module,
                execution_device=execution_device,
                offload=True,
                weights_map=weights_map,
                offload_buffers=offload_buffers,
                module_name=module_name,
                skip_keys=skip_keys,
                preload_module_classes=preload_module_classes,
                tied_params_map=tied_params_map,
            )
        return

    # Exactly one of the two may still be a scalar here: broadcast it over the keys of the other.
    if not isinstance(execution_device, Mapping):
        execution_device = {key: execution_device for key in offload.keys()}
    if not isinstance(offload, Mapping):
        offload = {key: offload for key in execution_device.keys()}

    if module_name in execution_device and module_name in offload and not offload[module_name]:
        # Block kept on its execution device: one hook placing all its submodules, plus execution-device hooks on
        # the descendants.
        hook = AlignDevicesHook(
            execution_device=execution_device[module_name],
            offload_buffers=offload_buffers,
            io_same_device=(module_name == ""),
            place_submodules=True,
            skip_keys=skip_keys,
            tied_params_map=tied_params_map,
        )
        add_hook_to_module(module, hook)
        attach_execution_device_hook(
            module, execution_device[module_name], skip_keys=skip_keys, tied_params_map=tied_params_map
        )
    elif module_name in execution_device and module_name in offload:
        # Offloaded block: per-submodule offload hooks.
        attach_align_device_hook(
            module,
            execution_device=execution_device[module_name],
            offload=True,
            weights_map=weights_map,
            offload_buffers=offload_buffers,
            module_name=module_name,
            skip_keys=skip_keys,
            preload_module_classes=preload_module_classes,
            tied_params_map=tied_params_map,
        )
        # The block itself may own no direct tensor and thus have received no hook above; it still gets one.
        if not hasattr(module, "_hf_hook"):
            hook = AlignDevicesHook(
                execution_device=execution_device[module_name],
                io_same_device=(module_name == ""),
                skip_keys=skip_keys,
                tied_params_map=tied_params_map,
            )
            add_hook_to_module(module, hook)
        attach_execution_device_hook(
            module,
            execution_device[module_name],
            preload_module_classes=preload_module_classes,
            skip_keys=skip_keys,
            tied_params_map=tied_params_map,
        )
    elif module_name == "":
        # Root module without an entry of its own: only make its outputs land on the device of its inputs.
        hook = AlignDevicesHook(
            execution_device=execution_device.get(""),
            io_same_device=True,
            skip_keys=skip_keys,
            tied_params_map=tied_params_map,
        )
        add_hook_to_module(module, hook)

    for child_name, child in module.named_children():
        child_name = f"{module_name}.{child_name}" if len(module_name) > 0 else child_name
        attach_align_device_hook_on_blocks(
            child,
            execution_device=execution_device,
            offload=offload,
            weights_map=weights_map,
            offload_buffers=offload_buffers,
            module_name=child_name,
            preload_module_classes=preload_module_classes,
            skip_keys=skip_keys,
            tied_params_map=tied_params_map,
        )


class CpuOffload(ModelHook):
    """
    Offloads a model on the CPU until its forward pass is called. The model will not be offloaded back to the CPU after
    the forward, the user needs to call the `init_hook` method again for this.

    Args:
        execution_device(`str`, `int` or `torch.device`, *optional*):
            The device on which the model should be executed. Defaults to `PartialState().default_device` when not
            provided.
        prev_module_hook (`UserCpuOffloadHook`, *optional*):
            The hook sent back by [`cpu_offload_with_hook`] for a previous model in the pipeline you are running. If
            passed, its offload method will be called just before the forward of the model to which this hook is
            attached.
    """

    def __init__(
        self,
        execution_device: Optional[Union[str, int, torch.device]] = None,
        prev_module_hook: Optional["UserCpuOffloadHook"] = None,
    ):
        self.prev_module_hook = prev_module_hook

        self.execution_device = execution_device if execution_device is not None else PartialState().default_device

    def init_hook(self, module):
        """Moves `module` to the CPU and returns the result of `module.to("cpu")`."""
        return module.to("cpu")

    @_compiler_disable
    def pre_forward(self, module, *args, **kwargs):
        """
        Offloads the previous module of the pipeline (if any), then moves `module` and its inputs to the execution
        device.

        Returns:
            `tuple`: The `(args, kwargs)` to use for the forward pass. They are returned untouched when the first
            parameter of `module` already lives on `self.execution_device`.
        """
        if self.prev_module_hook is not None and isinstance(self.prev_module_hook, UserCpuOffloadHook):
            prev_module = self.prev_module_hook.model
            # A module without parameters has no device to inspect; a default avoids a bare `StopIteration`.
            prev_param = next(prev_module.parameters(), None)

            # Only skip the offload when the previous module is known to already be on CPU.
            if prev_param is None or prev_param.device != torch.device("cpu"):
                self.prev_module_hook.offload()
                clear_device_cache()

        # If the current device is already the self.execution_device, we can skip the transfer. When the module has
        # no parameter its device is unknown, so the transfer is always performed.
        first_param = next(module.parameters(), None)
        if first_param is not None and first_param.device == self.execution_device:
            return args, kwargs

        module.to(self.execution_device)
        return send_to_device(args, self.execution_device), send_to_device(kwargs, self.execution_device)


class UserCpuOffloadHook:
    """
    Pairs a model with the `ModelHook` attached to it, exposing simple methods to re-run the hook's init step or to
    remove the hook from the model altogether.
    """

    def __init__(self, model, hook):
        self.model, self.hook = model, hook

    def offload(self):
        """Calls the hook's `init_hook` on the stored model; the result is discarded."""
        hook, model = self.hook, self.model
        hook.init_hook(model)

    def remove(self):
        """Detaches the hook from the stored model with `remove_hook_from_module`."""
        remove_hook_from_module(self.model)


class LayerwiseCastingHook(ModelHook):
    r"""
    Hook keeping a module's weights in `storage_dtype` between forward passes and switching them to `compute_dtype`
    for the duration of each forward pass. Storing in a lower precision can noticeably shrink the memory footprint,
    possibly at the price of some loss of output quality.
    """

    _is_stateful = False

    def __init__(self, storage_dtype: torch.dtype, compute_dtype: torch.dtype, non_blocking: bool) -> None:
        self.non_blocking = non_blocking
        self.compute_dtype = compute_dtype
        self.storage_dtype = storage_dtype

    def _cast(self, module: torch.nn.Module, dtype: torch.dtype) -> None:
        # Single place where the dtype switch happens, always with the configured `non_blocking` flag.
        module.to(dtype=dtype, non_blocking=self.non_blocking)

    def init_hook(self, module: torch.nn.Module):
        """Puts `module` in the storage dtype when the hook is attached and returns it."""
        self._cast(module, self.storage_dtype)
        return module

    @_compiler_disable
    def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
        """Switches `module` to the compute dtype; the forward inputs are passed through unchanged."""
        self._cast(module, self.compute_dtype)
        return args, kwargs

    @_compiler_disable
    def post_forward(self, module: torch.nn.Module, output):
        """Switches `module` back to the storage dtype; `output` is returned unchanged."""
        self._cast(module, self.storage_dtype)
        return output


================================================
FILE: src/accelerate/inference.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from types import MethodType
from typing import Any, Optional, Union

from .state import PartialState
from .utils import (
    calculate_maximum_sizes,
    convert_bytes,
    copy_tensor_to_devices,
    ignorant_find_batch_size,
    infer_auto_device_map,
    is_pippy_available,
    pad_input_tensors,
    send_to_device,
)


def generate_device_map(
    model, num_processes: int = 1, no_split_module_classes=None, max_memory: Optional[dict] = None
):
    """
    Builds a device map for `model` with `infer_auto_device_map`, for use with PiPPy.

    With a single process the map is inferred without any memory constraint (`max_memory` is not used in that case).
    Otherwise, when `max_memory` is not provided, every process index gets the same budget: the model size plus the
    first element of the second value returned by `calculate_maximum_sizes`, divided by `num_processes`, rounded up
    in its human-readable unit and increased by 10%.
    """
    if num_processes == 1:
        return infer_auto_device_map(model, no_split_module_classes=no_split_module_classes, clean_result=False)

    if max_memory is None:
        model_size, extra = calculate_maximum_sizes(model)

        # Even share per process, expressed as "<value> <unit>" by `convert_bytes`.
        amount, unit = convert_bytes((model_size + extra[0]) / num_processes).split(" ")

        # Headroom for potential extra shared memory instances.
        budget = f"{math.ceil(float(amount)) * 1.1} {unit}"
        max_memory = dict.fromkeys(range(num_processes), budget)

    return infer_auto_device_map(
        model,
        max_memory=max_memory,
        no_split_module_classes=no_split_module_classes,
        clean_result=False,
    )


def find_pippy_batch_size(args, kwargs):
    """
    Returns the first batch size `ignorant_find_batch_size` can extract from the inputs, or `None` if there is none.

    Positional arguments are inspected first, in order; the values of `kwargs` are only inspected when no positional
    argument yielded a batch size. Either of `args` and `kwargs` may be `None`.
    """
    if args is not None:
        for positional in args:
            batch_size = ignorant_find_batch_size(positional)
            if batch_size is not None:
                return batch_size

    if kwargs is not None:
        for value in kwargs.values():
            batch_size = ignorant_find_batch_size(value)
            if batch_size is not None:
                return batch_size

    return None


def build_pipeline(model, split_points, args, kwargs, num_chunks):
    """
    Traces `model` with `torch.distributed.pipelining.pipeline`, marking the beginning of every entry of
    `split_points` as a split, builds the stage of the current local process and wraps it in a `ScheduleGPipe`.

    `args` and `kwargs` are handed to the tracer as the example micro-batch inputs (`mb_args` / `mb_kwargs`), and
    `num_chunks` is passed to the schedule. The stage is built for `PartialState().local_process_index` on
    `PartialState().device`.

    Returns the `ScheduleGPipe` instance.
    """
    # Note: We import here to reduce import time from general modules, and isolate outside dependencies
    from torch.distributed.pipelining import ScheduleGPipe, SplitPoint, pipeline

    process_state = PartialState()

    # PiPPy needs to know where the model should be cut: each listed module starts a new stage.
    traced = pipeline(
        model,
        mb_args=args,
        mb_kwargs=kwargs,
        split_spec=dict.fromkeys(split_points, SplitPoint.BEGINNING),
    )
    local_stage = traced.build_stage(process_state.local_process_index, device=process_state.device)
    return ScheduleGPipe(local_stage, num_chunks)


def pippy_forward(forward, num_chunks, gather_output, *args, **kwargs):
    """
    Runs one step of the pipeline through `forward`, with a behavior that depends on the role of the current process.

    - Single process: `forward` is called with the inputs and its result is the output.
    - Local main process: the batch size is looked up in the inputs, which are padded with `pad_input_tensors` when
      it differs from `num_chunks`, then `forward` is called with them; its result is discarded.
    - Last process: `forward` is called without inputs and its result is the output.
    - Any other process: `forward` is called without inputs and its result is discarded.

    When `gather_output` is truthy the output goes through `copy_tensor_to_devices` before being returned.

    Raises:
        `ValueError`: On the local main process, if no batch size can be found in `args` or `kwargs`.
    """
    state = PartialState()
    result = None

    if state.num_processes == 1:
        result = forward(*args, **kwargs)
    elif state.is_local_main_process:
        batch_size = find_pippy_batch_size(args, kwargs)
        if batch_size is None:
            raise ValueError("Could not find batch size from args or kwargs")
        if batch_size != num_chunks:
            args = pad_input_tensors(args, batch_size, num_chunks)
            kwargs = pad_input_tensors(kwargs, batch_size, num_chunks)
        forward(*args, **kwargs)
    elif state.is_last_process:
        result = forward()
    else:
        forward()

    # Each node will get a copy of the full output which is only on the last GPU
    return copy_tensor_to_devices(result) if gather_output else result


def prepare_pippy(
    model,
    split_points: Optional[Union[str, list[str]]] = "auto",
    no_split_module_classes: Optional[list[str]] = None,
    example_args: Optional[tuple[Any]] = (),
    example_kwargs: Optional[dict[str, Any]] = None,
    num_chunks: Optional[int] = None,
    gather_output: Optional[bool] = False,
):
    """
    Wraps `model` for pipeline parallel inference.

    Args:
        model (`torch.nn.Module`):
            A model we want to split for pipeline-parallel inference
        split_points (`str` or `List[str]`, defaults to 'auto'):
            How to generate the split points and chunk the model across each GPU. 'auto' will find the best balanced
            split given any model. Otherwise it should be a list of layer names in the model to split by; a single
            layer name passed as a string is treated as a one-element list.
        no_split_module_classes (`List[str]`):
            A list of class names for layers we don't want to be split.
        example_args (tuple of model inputs):
            The expected inputs for the model that uses order-based inputs for a *single process*. Recommended to use
            this method if possible.
        example_kwargs (dict of model inputs)
            The expected inputs for the model that uses dictionary-based inputs for a *single process*. This is a
            *highly* limiting structure that requires the same keys be present at *all* inference calls. Not
            recommended unless the prior condition is true for all cases.
        num_chunks (`int`, defaults to the number of available GPUs):
            The number of different stages the Pipeline will have. By default it will assign one chunk per GPU, but
            this can be tuned and played with. In general one should have num_chunks >= num_gpus.
        gather_output (`bool`, defaults to `False`):
            If `True`, the output from the last GPU (which holds the true outputs) is sent across to all GPUs.

    Returns:
        The same `model` object, with its `forward` replaced by a pipeline-aware wrapper and the attributes
        `hf_split_points`, `pippy_stage`, `_original_forward` and `_original_call` set on it.

    Raises:
        `ImportError`: If `is_pippy_available()` is false.
        `ValueError`: If `split_points="auto"` and the generated device map assigns no module to one of the chunks.
    """
    if not is_pippy_available():
        raise ImportError("Using `torch.distributed.pipelining` requires PyTorch 2.4.0 or later.")
    state = PartialState()
    example_args = send_to_device(example_args, "cpu")
    example_kwargs = send_to_device(example_kwargs, "cpu")
    if num_chunks is None:
        num_chunks = state.num_processes
    if isinstance(split_points, str):
        if split_points == "auto":
            device_map = generate_device_map(model, num_chunks, no_split_module_classes=no_split_module_classes)
            split_points = []
            # The first module placed on each device after the first one marks the start of a new stage.
            for i in range(1, num_chunks):
                first_module = next((k for k, v in device_map.items() if v == i), None)
                if first_module is None:
                    # Fail with an actionable message rather than a bare `StopIteration`.
                    raise ValueError(
                        f"Could not find a split point for chunk {i}: the automatically generated device map does "
                        "not assign any module to it. Pass explicit `split_points` or use fewer chunks."
                    )
                split_points.append(first_module)
        else:
            # A lone layer name must not be iterated character by character downstream.
            split_points = [split_points]
    model.hf_split_points = split_points
    stage = build_pipeline(model, split_points, example_args, example_kwargs, num_chunks)
    model._original_forward = model.forward
    model._original_call = model.__call__
    model.pippy_stage = stage

    def forward(*args, **kwargs):
        return pippy_forward(stage.step, num_chunks, gather_output, *args, **kwargs)

    # To act like a decorator so that it can be popped when doing `extract_model_from_parallel`
    # Note: creates an infinite recursion loop with `generate`
    model_forward = MethodType(forward, model)
    forward.__wrapped__ = model_forward
    model.forward = forward
    return model


================================================
FILE: src/accelerate/launchers.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import tempfile

import torch

from .state import AcceleratorState, PartialState
from .utils import (
    PrecisionType,
    PrepareForLaunch,
    are_libraries_initialized,
    check_cuda_p2p_ib_support,
    get_current_device_type,
    get_gpu_info,
    is_mps_available,
    is_torch_version,
    patch_environment,
)
from .utils.constants import ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION


def test_launch():
    """Smoke check: instantiate a `PartialState` and discard it, so any initialization error surfaces."""
    PartialState()


def notebook_launcher(
    function,
    args=(),
    num_processes=None,
    mixed_precision="no",
    use_port="29500",
    master_addr="127.0.0.1",
    node_rank=0,
    num_nodes=1,
    rdzv_backend="static",
    rdzv_endpoint="",
    rdzv_conf=None,
    rdzv_id="none",
    max_restarts=0,
    monitor_interval=0.1,
    log_line_prefix_template=None,
):
    """
    Launches a training function, using several processes or multiple nodes if it's possible in the current environment
    (TPU with multiple cores for instance).

    <Tip warning={true}>

    To use this function absolutely zero calls to a device must be made in the notebook session before calling. If any
    have been made, you will need to restart the notebook and make sure no cells use any device capability.

    Setting `ACCELERATE_DEBUG_MODE="1"` (or `"true"`) in your environment will run a test before truly launching to
    ensure that none of those calls have been made.

    </Tip>

    Args:
        function (`Callable`):
            The training function to execute. If it accepts arguments, the first argument should be the index of the
            process run.
        args (`Tuple`):
            Tuple of arguments to pass to the function (it will receive `*args`).
        num_processes (`int`, *optional*):
            The number of processes to use for training. It is not used for a TPU launch in Colab/Kaggle nor for a
            Colab session with fewer than two GPUs; in every other case it must be provided.
        mixed_precision (`str`, *optional*, defaults to `"no"`):
            If `fp16` or `bf16`, will use mixed precision training on multi-device.
        use_port (`str`, *optional*, defaults to `"29500"`):
            The port to use to communicate between processes when launching a multi-device training.
        master_addr (`str`, *optional*, defaults to `"127.0.0.1"`):
            The address to use for communication between processes.
        node_rank (`int`, *optional*, defaults to 0):
            The rank of the current node.
        num_nodes (`int`, *optional*, defaults to 1):
            The number of nodes to use for training.
        rdzv_backend (`str`, *optional*, defaults to `"static"`):
            The rendezvous method to use, such as 'static' (the default) or 'c10d'
        rdzv_endpoint (`str`, *optional*, defaults to `""`):
            The endpoint of the rdzv sync. storage.
        rdzv_conf (`Dict`, *optional*, defaults to `None`):
            Additional rendezvous configuration. The dictionary passed in is copied, never modified.
        rdzv_id (`str`, *optional*, defaults to `"none"`):
            The unique run id of the job.
        max_restarts (`int`, *optional*, defaults to 0):
            The maximum amount of restarts that elastic agent will conduct on workers before failure.
        monitor_interval (`float`, *optional*, defaults to 0.1):
            The interval in seconds that is used by the elastic_agent as a period of monitoring workers.
        log_line_prefix_template (`str`, *optional*, defaults to `None`):
            The prefix template for elastic launch logging. Available from PyTorch 2.2.0.

    Raises:
        `ValueError`: If `mixed_precision` is not a known mode, if `num_processes` is required but missing, if
            `node_rank >= num_nodes`, or if an `Accelerator` was already initialized before a TPU or multi-device
            launch.
        `RuntimeError`: If a library known to initialize the device on import is already loaded, or if one of the
            launched processes fails.

    Example:

    ```python
    # Assume this is defined in a Jupyter Notebook on an instance with two devices
    from accelerate import notebook_launcher


    def train(*args):
        # Your training function here
        ...


    notebook_launcher(train, args=(arg1, arg2), num_processes=2, mixed_precision="fp16")
    ```
    """
    # Are we in a google colab or a Kaggle Kernel?
    in_colab = False
    in_kaggle = False
    if any(key.startswith("KAGGLE") for key in os.environ.keys()):
        in_kaggle = True
    elif "IPython" in sys.modules:
        in_colab = "google.colab" in str(sys.modules["IPython"].get_ipython())

    try:
        mixed_precision = PrecisionType(mixed_precision.lower())
    except ValueError:
        # `mixed_precision` is still the raw user value here, since the conversion above failed.
        raise ValueError(
            f"Unknown mixed_precision mode: {mixed_precision.lower()}. Choose between {PrecisionType.list()}."
        )

    if (in_colab or in_kaggle) and (
        (os.environ.get("TPU_NAME", None) is not None) or (os.environ.get("PJRT_DEVICE", "") == "TPU")
    ):
        # TPU launch
        import torch_xla.distributed.xla_multiprocessing as xmp

        if len(AcceleratorState._shared_state) > 0:
            raise ValueError(
                "To train on TPU in Colab or Kaggle Kernel, the `Accelerator` should only be initialized inside "
                "your training function. Restart your notebook and make sure no cells initializes an "
                "`Accelerator`."
            )

        launcher = PrepareForLaunch(function, distributed_type="XLA")
        print("Launching a training on TPU cores.")
        xmp.spawn(launcher, args=args, start_method="fork")
    elif in_colab and (not torch.cuda.is_available() or get_gpu_info()[1] < 2):
        # No need for a distributed launch otherwise as it's either CPU or one GPU.
        if torch.cuda.is_available():
            print("Launching training on one GPU.")
        else:
            print("Launching training on one CPU.")
        function(*args)
    else:
        if num_processes is None:
            raise ValueError(
                "You have to specify the number of devices you would like to use, add `num_processes=...` to your call."
            )
        if node_rank >= num_nodes:
            raise ValueError("The node_rank must be less than the number of nodes.")
        if num_processes > 1:
            # Multi-device launch
            from torch.distributed.launcher.api import LaunchConfig, elastic_launch
            from torch.multiprocessing import start_processes
            from torch.multiprocessing.spawn import ProcessRaisedException

            if len(AcceleratorState._shared_state) > 0:
                raise ValueError(
                    "To launch a multi-device training from your notebook, the `Accelerator` should only be initialized "
                    "inside your training function. Restart your notebook and make sure no cells initializes an "
                    "`Accelerator`."
                )
            # Check for specific libraries known to initialize device that users constantly use
            problematic_imports = are_libraries_initialized("bitsandbytes")
            if len(problematic_imports) > 0:
                err = (
                    "Could not start distributed process. Libraries known to initialize device upon import have been "
                    "imported already. Please keep these imports inside your training function to try and help with this:"
                )
                for lib_name in problematic_imports:
                    err += f"\n\t* `{lib_name}`"
                raise RuntimeError(err)

            patched_env = dict(
                nproc=num_processes,
                node_rank=node_rank,
                world_size=num_nodes * num_processes,
                master_addr=master_addr,
                master_port=use_port,
                mixed_precision=mixed_precision,
            )

            # Check for CUDA P2P and IB issues
            if not check_cuda_p2p_ib_support():
                patched_env["nccl_p2p_disable"] = "1"
                patched_env["nccl_ib_disable"] = "1"

            # torch.distributed will expect a few environment variable to be here. We set the ones common to each
            # process here (the other ones will be set be the launcher).
            with patch_environment(**patched_env):
                # First dummy launch
                # Determine device type without initializing any device (which would break fork)
                device_type, distributed_type = get_current_device_type()
                # XPU requires spawn instead of fork
                start_method = "spawn" if device_type == "xpu" else "fork"
                # Accept both the documented "1" and the historical "true" spelling.
                debug_flag = os.environ.get("ACCELERATE_DEBUG_MODE", "false").lower()
                if debug_flag in ("1", "true"):
                    launcher = PrepareForLaunch(test_launch, distributed_type=distributed_type)
                    try:
                        start_processes(launcher, args=(), nprocs=num_processes, start_method=start_method)
                    except ProcessRaisedException as e:
                        err = "An issue was found when verifying a stable environment for the notebook launcher."
                        if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
                            raise RuntimeError(
                                f"{err} "
                                "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
                                "Please review your imports and test them when running the `notebook_launcher()` to identify "
                                f"which one is problematic and causing {device_type.upper()} to be initialized."
                            ) from e
                        else:
                            raise RuntimeError(f"{err} The following error was raised: {e}") from e
                # Now the actual launch
                # NOTE(review): `launcher` is built here but `elastic_launch` below receives `function` directly —
                # confirm whether the wrapper is meant to be the entrypoint.
                launcher = PrepareForLaunch(function, distributed_type=distributed_type)
                print(f"Launching training on {num_processes} {device_type.upper()}s.")
                try:
                    # Work on a copy so that the dictionary given by the caller is left untouched.
                    rdzv_conf = {} if rdzv_conf is None else dict(rdzv_conf)
                    if rdzv_backend == "static":
                        rdzv_conf["rank"] = node_rank
                        if not rdzv_endpoint:
                            rdzv_endpoint = f"{master_addr}:{use_port}"
                    launch_config_kwargs = dict(
                        min_nodes=num_nodes,
                        max_nodes=num_nodes,
                        nproc_per_node=num_processes,
                        run_id=rdzv_id,
                        rdzv_endpoint=rdzv_endpoint,
                        rdzv_backend=rdzv_backend,
                        rdzv_configs=rdzv_conf,
                        max_restarts=max_restarts,
                        monitor_interval=monitor_interval,
                        start_method=start_method,
                    )
                    # Older PyTorch versions do not know this `LaunchConfig` argument.
                    if is_torch_version(">=", ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION):
                        launch_config_kwargs["log_line_prefix_template"] = log_line_prefix_template
                    elastic_launch(config=LaunchConfig(**launch_config_kwargs), entrypoint=function)(*args)
                except ProcessRaisedException as e:
                    if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
                        raise RuntimeError(
                            f"{device_type.upper()} has been initialized before the `notebook_launcher` could create a forked subprocess. "
                            "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
                            "Please review your imports and test them when running the `notebook_launcher()` to identify "
                            f"which one is problematic and causing {device_type.upper()} to be initialized."
                        ) from e
                    else:
                        raise RuntimeError(f"An issue was found when launching the training: {e}") from e

        else:
            # No need for a distributed launch otherwise as it's either CPU, GPU, XPU or MPS.
            if is_mps_available():
                os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
                print("Launching training on MPS.")
            elif torch.cuda.is_available():
                print("Launching training on one GPU.")
            elif torch.xpu.is_available():
                print("Launching training on one XPU.")
            else:
                print("Launching training on CPU.")
            function(*args)


def debug_launcher(function, args=(), num_processes=2):
    """
    Run `function` in several forked CPU processes, for debugging and internal tests.

    <Tip warning={true}>

    This launcher exists for internal testing and debugging only; it is not meant for real trainings and always runs
    on CPU.

    </Tip>

    Args:
        function (`Callable`):
            The training function to execute.
        args (`Tuple`):
            Tuple of arguments to pass to the function (it will receive `*args`).
        num_processes (`int`, *optional*, defaults to 2):
            The number of processes to use for training.
    """
    from torch.multiprocessing import start_processes

    # The temporary file's path is exported to the workers through the environment
    # (`accelerate_debug_rdv_file`); it is removed when the block exits.
    with tempfile.NamedTemporaryFile() as rdv_file:
        # Environment values common to every process. torch.distributed expects them to be present;
        # the remaining per-process ones are set by the launcher.
        shared_env = {
            "world_size": num_processes,
            "master_addr": "127.0.0.1",
            "master_port": "29500",
            "accelerate_mixed_precision": "no",
            "accelerate_debug_rdv_file": rdv_file.name,
            "accelerate_use_cpu": "yes",
        }
        with patch_environment(**shared_env):
            start_processes(
                PrepareForLaunch(function, debug=True),
                args=args,
                nprocs=num_processes,
                start_method="fork",
            )


================================================
FILE: src/accelerate/local_sgd.py
================================================
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

from accelerate import Accelerator, DistributedType


class LocalSGD:
    """
    Context manager that adds local SGD on top of an `Accelerator`: every device performs its updates independently
    and the model weights are averaged across processes once every `local_sgd_steps` calls to [`~LocalSGD.step`].

    It should be used only in the multi-GPU (or multi-CPU) setup without extensions such as DeepSpeed. In particular,
    this is a simple implementation that cannot support scenarios such as model parallelism.


    Although we are not aware of the true origins of this simple approach, the idea of local SGD is quite old and goes
    back to at least:

    Zhang, J., De Sa, C., Mitliagkas, I., & Ré, C. (2016). [Parallel SGD: When does averaging help?. arXiv preprint
    arXiv:1606.07365.](https://huggingface.co/papers/1606.07365)

    We credit the term Local SGD to the following paper (but there might be earlier references we are not aware of).

    Stich, Sebastian Urban. ["Local SGD Converges Fast and Communicates Little." ICLR 2019-International Conference on
    Learning Representations. No. CONF. 2019.](https://huggingface.co/papers/1805.09767)

    """

    def __init__(self, accelerator: Accelerator, model: torch.nn.Module, local_sgd_steps: int, enabled: bool = True):
        """
        Constructor.

        Args:
            accelerator (`Accelerator`):
                Accelerator object.
            model (`torch.nn.Module`):
                The model whose parameters we need to average. Its `no_sync()` method is used when the context is
                entered.
            local_sgd_steps (`int`):
                A number of local SGD steps (before model parameters are synchronized).
            enabled (`bool`):
                Local SGD is disabled if this parameter is set to `False`. It is also disabled when the accelerator
                is not running in a distributed setup (`DistributedType.NO`).

        Raises:
            `NotImplementedError`: if the accelerator's distributed type is not one of the supported ones.
        """
        supported_types = (
            DistributedType.NO,
            DistributedType.MULTI_CPU,
            DistributedType.MULTI_GPU,
            DistributedType.MULTI_XPU,
            DistributedType.MULTI_MLU,
            DistributedType.MULTI_HPU,
            DistributedType.MULTI_SDAA,
            DistributedType.MULTI_MUSA,
            DistributedType.MULTI_NPU,
            DistributedType.MULTI_NEURON,
        )
        if accelerator.distributed_type not in supported_types:
            raise NotImplementedError("LocalSGD is supported only for CPUs and GPUs (no DeepSpeed or MegatronLM)")

        self.num_steps = 0
        self.enabled = enabled and accelerator.distributed_type != DistributedType.NO
        if not self.enabled:
            # Nothing else is stored when disabled; the other methods never touch these attributes in that case.
            return
        self.accelerator = accelerator
        self.model = model
        self.local_sgd_steps = local_sgd_steps

    def __enter__(self):
        """Enter the model's `no_sync()` context (when enabled) and return `self`."""
        if self.enabled:
            self.model_sync_obj = self.model.no_sync()
            self.model_sync_obj.__enter__()
        return self

    def __exit__(self, type, value, tb):
        """Average the parameters one last time, then leave the `no_sync()` context (when enabled)."""
        if not self.enabled:
            return
        # Average all models on exit
        self._sync_and_avg_model_params()
        self.model_sync_obj.__exit__(type, value, tb)

    def step(self):
        """
        Count one step and, when enabled, average the model parameters every `local_sgd_steps` steps.
        """
        self.num_steps += 1
        if self.enabled and self.num_steps % self.local_sgd_steps == 0:
            self._sync_and_avg_model_params()

    def _sync_and_avg_model_params(self):
        """
        Wait for every process, then replace each parameter's data with its mean across processes.
        """
        accelerator = self.accelerator
        accelerator.wait_for_everyone()
        with accelerator.autocast():
            for parameter in self.model.parameters():
                parameter.data = accelerator.reduce(parameter.data, reduction="mean")


================================================
FILE: src/accelerate/logging.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

import functools
import logging
import os

from .state import PartialState


class MultiProcessAdapter(logging.LoggerAdapter):
    """
    Logger adapter that is aware of the process it runs in.

    `log` understands an extra `main_process_only` kwarg deciding whether the record is emitted on every process or
    only on the main one. Default is `main_process_only=True`.

    Does not require an `Accelerator` object to be created first.
    """

    @staticmethod
    def _should_log(main_process_only):
        "Tell whether the current process is allowed to emit the record"
        state = PartialState()
        if not main_process_only:
            return True
        return state.is_main_process

    def process(self, msg, kwargs):
        """Prefix `msg` with the rank of the current process and default `stacklevel` to 2."""
        msg, kwargs = super().process(msg, kwargs)

        # Default `stacklevel` so that `Logger.findCaller()` skips this adapter; an explicit user value wins.
        kwargs.setdefault("stacklevel", 2)

        rank = PartialState().process_index
        return f"[RANK {rank}] {msg}", kwargs

    def log(self, level, msg, *args, **kwargs):
        """
        Forwards the call to the wrapped logger once it is established that this process should log.

        Accepts a new kwarg of `main_process_only`, which will dictate whether it will be logged across all processes
        or only the main executed one. Default is `True` if not passed

        Also accepts "in_order", which if `True` makes the processes log one by one, in order. This is much easier to
        read, but comes at the cost of sometimes needing to wait for the other processes. Default is `False` to not
        break with the previous behavior.

        `main_process_only` is ignored if `in_order` is passed.
        """
        if PartialState._shared_state == {}:
            raise RuntimeError(
                "You must initialize the accelerate state by calling either `PartialState()` or `Accelerator()` before using the logging utility."
            )
        # Both extra kwargs are removed before the remaining ones reach the wrapped logger.
        main_process_only = kwargs.pop("main_process_only", True)
        in_order = kwargs.pop("in_order", False)

        if not self.isEnabledFor(level):
            return

        msg, kwargs = self.process(msg, kwargs)
        if in_order:
            # Every process takes its turn, with a barrier after each rank.
            state = PartialState()
            for rank in range(state.num_processes):
                if rank == state.process_index:
                    self.logger.log(level, msg, *args, **kwargs)
                state.wait_for_everyone()
            return

        if self._should_log(main_process_only):
            self.logger.log(level, msg, *args, **kwargs)

    @functools.lru_cache(None)
    def warning_once(self, *args, **kwargs):
        """
        Same as `logger.warning()`, except that a given message is only emitted once.

        Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the
        cache. The assumption here is that all warning messages are unique across the code. If they aren't then need to
        switch to another type of cache that includes the caller frame information in the hashing function.
        """
        self.warning(*args, **kwargs)


def get_logger(name: str, log_level: str | None = None):
    """
    Build a multiprocess-aware adapter around the `logging.Logger` called `name`.

    To emit a record on every process, pass `main_process_only=False` to the logging call. To emit it on every
    process one after the other, also pass `in_order=True`.

    Args:
        name (`str`):
            The name for the logger, such as `__file__`
        log_level (`str`, *optional*):
            The log level to use. If not passed, the `ACCELERATE_LOG_LEVEL` environment variable is used. If neither
            is set, no level is applied. When a level is found, it is upper-cased and set on both the named logger
            and the root logger.

    Example:

    ```python
    >>> from accelerate.logging import get_logger
    >>> from accelerate import Accelerator

    >>> logger = get_logger(__name__)

    >>> accelerator = Accelerator()
    >>> logger.info("My log", main_process_only=False)
    >>> logger.debug("My log", main_process_only=True)

    >>> logger = get_logger(__name__, log_level="DEBUG")
    >>> logger.info("My log")
    >>> logger.debug("My second log")

    >>> array = ["a", "b", "c", "d"]
    >>> letter_at_rank = array[accelerator.process_index]
    >>> logger.info(letter_at_rank, in_order=True)
    ```
    """
    level = log_level if log_level is not None else os.environ.get("ACCELERATE_LOG_LEVEL", None)
    base_logger = logging.getLogger(name)
    if level is not None:
        level_name = level.upper()
        base_logger.setLevel(level_name)
        base_logger.root.setLevel(level_name)
    return MultiProcessAdapter(base_logger, {})


================================================
FILE: src/accelerate/memory_utils.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings


# Importing this module only emits a warning pointing users at the new import location.
# The helper's actual name is `find_executable_batch_size`, as in the suggested import statement.
warnings.warn(
    "memory_utils has been reorganized to utils.memory. Import `find_executable_batch_size` from the main `__init__`: "
    "`from accelerate import find_executable_batch_size` to avoid this warning.",
    FutureWarning,
)


================================================
FILE: src/accelerate/optimizer.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect

import torch

from .state import AcceleratorState, GradientState
from .utils import DistributedType, honor_type, is_lomo_available, is_torch_xla_available


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.runtime as xr


def move_to_device(state, device):
    """
    Recursively send every tensor found in `state` to `device`.

    Lists and tuples are rebuilt through `honor_type`, dictionaries are rebuilt with their own type, tensors are
    moved with `.to(device)`, and any other object is returned as is.
    """
    if isinstance(state, (list, tuple)):
        moved_items = (move_to_device(item, device) for item in state)
        return honor_type(state, moved_items)
    if isinstance(state, dict):
        moved_mapping = {key: move_to_device(value, device) for key, value in state.items()}
        return type(state)(moved_mapping)
    if isinstance(state, torch.Tensor):
        return state.to(device)
    return state


class AcceleratedOptimizer(torch.optim.Optimizer):
    """
    Internal wrapper around a torch optimizer.

    Conditionally will perform `step` and `zero_grad` if gradients should be synchronized when performing gradient
    accumulation.

    Args:
        optimizer (`torch.optim.optimizer.Optimizer`):
            The optimizer to wrap.
        device_placement (`bool`, *optional*, defaults to `True`):
            Whether or not the optimizer should handle device placement. If so, it will place the state dictionary of
            `optimizer` on the right device.
        scaler (`torch.amp.GradScaler` or `torch.cuda.amp.GradScaler`, *optional*):
            The scaler to use in the step function if training with mixed precision.
    """

    def __init__(self, optimizer, device_placement=True, scaler=None):
        # NOTE: `torch.optim.Optimizer.__init__` is not called here; `state`, `param_groups` and `defaults`
        # are exposed below as properties that delegate to the wrapped optimizer.
        self.optimizer = optimizer
        self.scaler = scaler
        self.accelerator_state = AcceleratorState()
        self.gradient_state = GradientState()
        self.device_placement = device_placement
        self._is_overflow = False

        if self.scaler is not None:
            # Keep both the original `step` and a patched version that records when it gets called;
            # `step()` below uses the pair to tell whether the scaler skipped the update.
            self._accelerate_step_called = False
            self._optimizer_original_step_method = self.optimizer.step
            self._optimizer_patched_step_method = patch_optimizer_step(self, self.optimizer.step)

        # Handle device placement
        if device_placement:
            state_dict = self.optimizer.state_dict()
            if self.accelerator_state.distributed_type == DistributedType.XLA:
                xm.send_cpu_data_to_device(state_dict, self.accelerator_state.device)
            else:
                state_dict = move_to_device(state_dict, self.accelerator_state.device)
            self.optimizer.load_state_dict(state_dict)

    @property
    def state(self):
        """The wrapped optimizer's `state`."""
        return self.optimizer.state

    @state.setter
    def state(self, state):
        self.optimizer.state = state

    @property
    def param_groups(self):
        """The wrapped optimizer's `param_groups`."""
        return self.optimizer.param_groups

    @param_groups.setter
    def param_groups(self, param_groups):
        self.optimizer.param_groups = param_groups

    @property
    def defaults(self):
        """The wrapped optimizer's `defaults`."""
        return self.optimizer.defaults

    @defaults.setter
    def defaults(self, defaults):
        self.optimizer.defaults = defaults

    def add_param_group(self, param_group):
        """Forward `param_group` to the wrapped optimizer."""
        self.optimizer.add_param_group(param_group)

    def load_state_dict(self, state_dict):
        """Load `state_dict` into the wrapped optimizer, sending it to the device first on XLA with device placement."""
        if self.accelerator_state.distributed_type == DistributedType.XLA and self.device_placement:
            xm.send_cpu_data_to_device(state_dict, self.accelerator_state.device)
        self.optimizer.load_state_dict(state_dict)

    def state_dict(self):
        """Return the wrapped optimizer's state dict."""
        return self.optimizer.state_dict()

    def zero_grad(self, set_to_none=None):
        """
        Zero the gradients of the wrapped optimizer, but only when gradients are being synchronized.

        Args:
            set_to_none (`bool`, *optional*):
                Forwarded to the wrapped optimizer when its `zero_grad` accepts it (defaulting to `True` if left to
                `None`).

        Raises:
            `ValueError`: if `set_to_none` is given but the wrapped optimizer's `zero_grad` does not accept it.
        """
        if self.gradient_state.sync_gradients:
            accept_arg = "set_to_none" in inspect.signature(self.optimizer.zero_grad).parameters
            if accept_arg:
                if set_to_none is None:
                    set_to_none = True
                self.optimizer.zero_grad(set_to_none=set_to_none)
            else:
                if set_to_none is not None:
                    raise ValueError("`set_to_none` for Optimizer.zero_grad` is not supported by this optimizer.")
                self.optimizer.zero_grad()

    def train(self):
        """
        Sets the optimizer to "train" mode. Useful for optimizers like `schedule_free`
        """
        if hasattr(self.optimizer, "train") and callable(self.optimizer.train):
            self.optimizer.train()
        elif (
            hasattr(self.optimizer, "optimizer")
            and hasattr(self.optimizer.optimizer, "train")
            and callable(self.optimizer.optimizer.train)
        ):
            # the deepspeed optimizer further wraps the optimizer
            self.optimizer.optimizer.train()

    def eval(self):
        """
        Sets the optimizer to "eval" mode. Useful for optimizers like `schedule_free`
        """
        if hasattr(self.optimizer, "eval") and callable(self.optimizer.eval):
            self.optimizer.eval()
        elif (
            hasattr(self.optimizer, "optimizer")
            and hasattr(self.optimizer.optimizer, "eval")
            and callable(self.optimizer.optimizer.eval)
        ):
            # Mirror `train()`: the deepspeed optimizer further wraps the optimizer, so the inner one must be
            # switched as well, otherwise it would stay in "train" mode.
            self.optimizer.optimizer.eval()

    def step(self, closure=None):
        """
        Perform an optimizer step when gradients are being synchronized.

        With a scaler, the step goes through `scaler.step` / `scaler.update`, and `step_was_skipped` is refreshed
        depending on whether the wrapped optimizer's `step` was actually invoked. The call is a no-op for LOMO
        optimizers.

        Args:
            closure (`Callable`, *optional*):
                Forwarded to the wrapped optimizer's `step` (or to `scaler.step`).
        """
        if is_lomo_available():
            from lomo_optim import AdaLomo, Lomo

        if (
            not self.gradient_state.is_xla_gradients_synced
            and self.accelerator_state.distributed_type == DistributedType.XLA
        ):
            # Average the gradients across XLA processes: all-reduce sum scaled by 1 / world size.
            gradients = xm._fetch_gradients(self.optimizer)
            xm.all_reduce("sum", gradients, scale=1.0 / xr.world_size())
            self.gradient_state.is_xla_gradients_synced = True

        if is_lomo_available():
            #  `step` should be a no-op for LOMO optimizers.
            if isinstance(self.optimizer, (Lomo, AdaLomo)):
                return

        if self.gradient_state.sync_gradients:
            if self.scaler is not None:
                # Temporarily install the patched `step` so we can observe whether the scaler calls it.
                self.optimizer.step = self._optimizer_patched_step_method

                self.scaler.step(self.optimizer, closure)
                self.scaler.update()

                if not self._accelerate_step_called:
                    # If the optimizer step was skipped, gradient overflow was detected.
                    self._is_overflow = True
                else:
                    self._is_overflow = False
                # Reset the step method to the original one
                self.optimizer.step = self._optimizer_original_step_method
                # Reset the indicator
                self._accelerate_step_called = False
            else:
                self.optimizer.step(closure)
        if self.accelerator_state.distributed_type == DistributedType.XLA:
            self.gradient_state.is_xla_gradients_synced = False

    def _switch_parameters(self, parameters_map):
        """Replace, in every param group, each parameter that is a key of `parameters_map` by its mapped value."""
        for param_group in self.optimizer.param_groups:
            param_group["params"] = [parameters_map.get(p, p) for p in param_group["params"]]

    @property
    def step_was_skipped(self):
        """Whether or not the optimizer step was skipped."""
        return self._is_overflow

    def __getstate__(self):
        # The step-tracking attributes are left out of the pickled state; `__setstate__` recreates them.
        _ignored_keys = [
            "_accelerate_step_called",
            "_optimizer_original_step_method",
            "_optimizer_patched_step_method",
        ]
        return {k: v for k, v in self.__dict__.items() if k not in _ignored_keys}

    def __setstate__(self, state):
        self.__dict__.update(state)
        if self.scaler is not None:
            # Rebuild the step-tracking attributes dropped by `__getstate__`.
            self._accelerate_step_called = False
            self._optimizer_original_step_method = self.optimizer.step
            self._optimizer_patched_step_method = patch_optimizer_step(self, self.optimizer.step)


def patch_optimizer_step(accelerated_optimizer: AcceleratedOptimizer, method):
    """
    Wrap `method` so that each call first sets `accelerated_optimizer._accelerate_step_called` to `True`.

    The returned callable forwards all arguments to `method` and returns its result.
    """

    def patched_step(*step_args, **step_kwargs):
        # Record the call before delegating, so the flag is set even if `method` raises.
        accelerated_optimizer._accelerate_step_called = True
        outcome = method(*step_args, **step_kwargs)
        return outcome

    return patched_step


================================================
FILE: src/accelerate/parallelism_config.py
================================================
#
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import warnings
from dataclasses import dataclass
from typing import TYPE_CHECKING, Literal, Optional, Union

from accelerate.utils.dataclasses import (
    DeepSpeedSequenceParallelConfig,
    DistributedType,
    TorchContextParallelConfig,
    TorchTensorParallelConfig,
)
from accelerate.utils.versions import is_torch_version


if TYPE_CHECKING:
    from accelerate import Accelerator


@dataclass
class ParallelismConfig:
    """
    A dataclass to configure parallelisms applied to the model. Inspired by torchtitan's `ParallelDims`
    https://github.com/pytorch/torchtitan/blob/main/torchtitan/distributed/parallel_dims.py

    Args:
        dp_replicate_size (`int`, defaults to `1`):
            The size of the data parallel group. If `dp_replicate_size` is set to 1, the data parallel replication
            group will not be used.
        dp_shard_size (`int`, defaults to `1`):
            The size of the model shard group. If `dp_replicate_size > 1` and `tp_size > 1`, `dp_shard_size` must also
            be greater than 1, as composing DDP + TP is currently not supported.
        tp_size (`int`, defaults to `1`):
            The size of the tensor parallel group. If `tp_size` is set to `1`, the tensor parallel group will not be
            used.
        tp_handler (`~utils.TorchTensorParallelConfig`, defaults to `None`):
            The handler for the tensor parallel group.
        cp_size (`int`, defaults to `1`):
            The size of the context parallel group. Currently not supported, but reserved for future use and enabled
            for downstream libraries.
        cp_backend (`str`, defaults to `torch`):
            Which CP backend to use: `torch` (FSDP2)
        sp_size (`int`, defaults to `1`):
            The size of the sequence parallel group.
        sp_backend (`str`, defaults to `deepspeed`):
            Which SP backend to use:`deepspeed` (ALST/Ulysses)

    You may obtain different distributed data parallel paradigms by configuring `dp_replicate_size` and `dp_shard_size`
    together:
        - `dp_replicate_size == 1` and `dp_shard_size > 1`, we obtain Fully Sharded Data Parallel (FSDP).
        - `dp_replicate_size > 1` and `dp_shard_size > 1`, we obtain Hybrid Sharded Data Parallel (HSDP).
        - `dp_replicate_size > 1` and `dp_shard_size == 1` is an invalid configuration, to use pure DP, use
          `DistributedDataParallelKwargs` instead.

    """

    dp_replicate_size: Optional[int] = None
    dp_shard_size: Optional[int] = None
    tp_size: Optional[int] = None
    cp_size: Optional[int] = None
    cp_backend: Literal["torch"] = None
    sp_size: Optional[int] = None
    sp_backend: Literal["deepspeed"] = None

    # we use Union because we might support other x parallel plugins (i.e. deepspeed, etc)
    tp_handler: Union[None, TorchTensorParallelConfig] = None
    cp_handler: Union[None, TorchContextParallelConfig] = None
    sp_handler: Union[None, DeepSpeedSequenceParallelConfig] = None

    device_mesh = None

    def __repr__(self):
        """Multi-line summary of the sizes, backends, total size and handlers of this configuration."""
        return (
            "ParallelismConfig(\n "
            f"\tdp_replicate_size={self.dp_replicate_size},\n"
            f"\tdp_shard_size={self.dp_shard_size},\n"
            f"\ttp_size={self.tp_size},\n"
            f"\tcp_size={self.cp_size},\n"
            f"\tcp_backend={self.cp_backend},\n"
            f"\tsp_size={self.sp_size},\n"
            f"\tsp_backend={self.sp_backend},\n"
            # The separator after `total_size` was missing, unlike every other entry.
            f"\ttotal_size={self.total_size},\n"
            f"\ttp_handler={self.tp_handler},\n"
            f"\tcp_handler={self.cp_handler},\n"
            # `sp_handler` is reported alongside the other two handlers.
            f"\tsp_handler={self.sp_handler})\n"
        )

    def to_json(self):
        """
        Return a deep copy of this instance's attributes as a plain dictionary.

        `device_mesh` is left out. Any value that has a `__dict__` (such as the handler objects) is replaced by a
        deep copy of that `__dict__`.

        Returns:
            `dict`: attribute name to value; mutating it does not affect this instance.
        """
        import copy

        _non_serializable_fields = ["device_mesh"]

        # The result must be returned: without it, this method computed the copy and then yielded `None`.
        return copy.deepcopy(
            {
                k: copy.deepcopy(v.__dict__) if hasattr(v, "__dict__") else v
                for k, v in self.__dict__.items()
                if k not in _non_serializable_fields
            }
        )

    @property
    def dp_dim_names(self):
        """Enabled data-parallel dimension names, in the order `dp_replicate`, `dp_shard`."""
        candidates = (
            (self.dp_replicate_enabled, "dp_replicate"),
            (self.dp_shard_enabled, "dp_shard"),
        )
        return [dim_name for is_enabled, dim_name in candidates if is_enabled]

    @property
    def non_dp_dim_names(self):
        """Enabled non-data-parallel dimension names (they receive the same batch), in the order `tp`, `cp`, `sp`."""
        candidates = (
            (self.tp_enabled, "tp"),
            (self.cp_enabled, "cp"),
            (self.sp_enabled, "sp"),
        )
        return [dim_name for is_enabled, dim_name in candidates if is_enabled]

    @property
    def dp_shard_cp_dim_names(self):
        """Enabled dimension names (`dp_shard`, `cp`) that get flattened into the joint mesh the model is sharded across in FSDP."""
        candidates = (
            (self.dp_shard_enabled, "dp_shard"),
            (self.cp_enabled, "cp"),
        )
        return [dim_name for is_enabled, dim_name in candidates if is_enabled]

    @property
    def dp_cp_dim_names(self):
        """Enabled dimension names (`dp_replicate`, `dp_shard`, `cp`) across which the loss should be averaged."""
        candidates = (
            (self.dp_replicate_enabled, "dp_replicate"),
            (self.dp_shard_enabled, "dp_shard"),
            (self.cp_enabled, "cp"),
        )
        return [dim_name for is_enabled, dim_name in candidates if is_enabled]

    @property
    def fsdp_dim_names(self):
        """Dimension names FSDP is applied across: always `dp_shard_cp`, preceded by `dp_replicate` when replication is enabled."""
        if self.dp_replicate_enabled:
            return ["dp_replicate", "dp_shard_cp"]
        return ["dp_shard_cp"]

    @property
    def total_size(self):
        """Product of the five sizes: `dp_replicate`, `dp_shard`, `tp`, `cp` and `sp`."""
        product = self.dp_replicate_size
        for factor in (self.dp_shard_size, self.tp_size, self.cp_size, self.sp_size):
            product = product * factor
        return product

    @property
    def non_data_parallel_size(self):
        """Size of the non-data-parallel dimensions: the product of the tensor, context and sequence parallel sizes."""
        tp_times_cp = self.tp_size * self.cp_size
        return tp_times_cp * self.sp_size

    @property
    def data_parallel_size(self):
        """Size of the data-parallel dimensions: the product of `dp_replicate_size` and `dp_shard_size`."""
        replicas, shards = self.dp_replicate_size, self.dp_shard_size
        return replicas * shards

    @property
    def dp_replicate_enabled(self):
        """Whether data parallel replication is on, i.e. `dp_replicate_size > 1`."""
        replicas = self.dp_replicate_size
        return replicas > 1

    @property
    def dp_shard_enabled(self):
        """Whether data parallel sharding is on, i.e. `dp_shard_size > 1`."""
        shards = self.dp_shard_size
        return shards > 1

    @property
    def tp_enabled(self):
        """Whether tensor parallelism is on, i.e. `tp_size > 1`."""
        tensor_ranks = self.tp_size
        return tensor_ranks > 1

    @property
    def cp_enabled(self):
        """Whether context parallelism is on, i.e. `cp_size > 1`."""
        context_ranks = self.cp_size
        return context_ranks > 1

    @property
    def sp_enabled(self):
        """Whether sequence parallelism is on, i.e. `sp_size > 1`."""
        sequence_ranks = self.sp_size
        return sequence_ranks > 1

    @property
    def active_mesh_dims(self):
        """All enabled mesh dimension names: the data-parallel ones followed by the non-data-parallel ones."""
        return [*self.dp_dim_names, *self.non_dp_dim_names]

    def build_device_mesh(self, device_type: str):
        """Create the device mesh described by this configuration for `device_type`.

        The joint meshes `dp`, `dp_shard_cp` and `dp_cp` are also created by flattening the matching enabled
        dimensions.

        Args:
            device_type (`str`): The type of device for which to build the mesh (presumably a torch device type
                such as `cuda`; it is passed unchanged to `init_device_mesh`).

        Returns:
            The created device mesh, or `None` when DeepSpeed sequence parallelism is active or when no mesh
            dimension is enabled.

        Raises:
            `RuntimeError`: if the installed torch is older than 2.2.0.
        """
        # DeepSpeed handles its own SP groups, so no mesh is built for it. This only applies when SP is really
        # on (sp_size > 1); otherwise TP/CP/FSDP may still need a mesh.
        uses_deepspeed_sp = self.sp_backend == "deepspeed" and self.sp_size > 1
        if uses_deepspeed_sp:
            return None

        if not is_torch_version(">=", "2.2.0"):
            raise RuntimeError("Building a device_mesh requires to have torch>=2.2.0")
        from torch.distributed.device_mesh import init_device_mesh

        mesh = self._get_mesh()
        if len(mesh) == 0:
            return None
        dim_names, shape = mesh
        device_mesh = init_device_mesh(device_type, shape, mesh_dim_names=dim_names)

        # (flattened mesh name, property listing the dimensions to flatten), handled in this order.
        joint_meshes = (
            ("dp", "dp_dim_names"),
            ("dp_shard_cp", "dp_shard_cp_dim_names"),
            ("dp_cp", "dp_cp_dim_names"),
        )
        for flat_name, dims_property in joint_meshes:
            dims = getattr(self, dims_property)
            if dims:
                device_mesh[dims]._flatten(flat_name)

        return device_mesh

    def get_device_mesh(self, device_type: Optional[str] = None):
        """
        Return the device mesh, building it with `build_device_mesh` on first use.

        Args:
            device_type (`str`, *optional*):
                Device type used to build the mesh. Required while no mesh is stored; once a mesh exists it is only
                checked against the mesh's own `device_type`.

        Returns:
            The stored device mesh (this can be `None` when `build_device_mesh` returned `None`).

        Raises:
            `ValueError`: if no mesh is stored and `device_type` is not given, or if `device_type` differs from the
            device type of the stored mesh.
        """
        if self.device_mesh is None:
            if device_type is None:
                # Raising a bare string is itself a `TypeError`; raise a real exception carrying the message.
                raise ValueError("You need to pass a device_type e.g cuda to build the device mesh")
            self.device_mesh = self.build_device_mesh(device_type)
        elif device_type is not None and self.device_mesh.device_type != device_type:
            raise ValueError(
                f"The device_mesh is already created with device type {self.device_mesh.device_type}. However, you are trying to get a device mesh with device_type {device_type}. Please check if you correctly initialized your device_mesh"
            )
        return self.device_mesh

    def _get_mesh(self) -> tuple[tuple[str, ...], tuple[int, ...]]:
        """Generate dimension names and mesh shape for torch.distributed.init_device_mesh().

        Returns:
            A `(dim_names, sizes)` pair, both ordered as `dp_replicate`, `dp_shard`, `cp`, `sp`, `tp` and restricted
            to the active dimensions. When no dimension is active the result is an empty tuple.
        """

        # Map each active dimension to its size.
        # NOTE(review): `_sizes` is defined outside the visible part of this class — confirm it is keyed by
        # dimension name.
        mesh_dims = {parallelism: self._sizes[parallelism] for parallelism in self.active_mesh_dims}

        # Apply canonical ordering
        mesh_order = ["dp_replicate", "dp_shard", "cp", "sp", "tp"]
        sorted_items = sorted(
            mesh_dims.items(),
            key=lambda item: mesh_order.index(item[0]),
        )
        # Transpose [(name, size), ...] into (names, sizes): names come first.
        return tuple(zip(*sorted_items))

    def __post_init__(self):
        # Basic size validation
        if self.dp_replicate_size is None:
            self.dp_replicate_size = int(os.environ.get("PARALLELISM_CONFIG_DP_REPLICATE_SIZE", "1"))
        if self.dp_shard_size is None:
            self.dp_shard_size = int(os.environ.get("PARALLELISM_CONFIG_DP_SHARD_SIZE", "1"))
        if self.tp_size is None:
            self.tp_size = int(os.environ.get("PARALLELISM_CONFIG_TP_SIZE", "1"))
        if self.cp_size is None:
            self.cp_size = int(os.environ.get("PARALLELISM_CONFIG_CP_SIZE", "1"))
        if self.cp_backend is None:
            self.cp_backend = os.environ.get("PARALLELISM_CONFIG_CP_BACKEND", "torch")
        if self.sp_size is None:
            self.sp_size = int(os.environ.get("PARALLELISM_CONFIG_SP_SIZE", "1"))
        if self.sp_backend is None:
            self.sp_backend = os.environ.get("PARALLELISM_CONFIG_SP_BACKEND", "deepspeed")

        if self.tp_size > 1:
            if self.tp_handler is None:
                self.tp_handler = TorchTensorParallelConfig()

        if self.cp_size > 1:
            if self.cp_handler is None:
                self.cp_handler = TorchContextParallelConfig()
            else:
                cp_backends_config_map = dict(
                    torch=TorchContextParallelConfig,
                )
                if not isinstance(self.cp_handler, cp_backends_config_map[self.cp_backend]):
                    raise ValueError(
                        f"ParallelismConfig's cp_backend={self.cp_backend} requires {cp_backends_config_map[self.cp_backend]}, but cp_handler was set to {type(self.cp_handler)}"
                    )

        if self.sp_size > 1:
            if self.sp_handler is None:
                self.sp_handler = DeepSpeedSequenceParallelConfig()
        if self.dp_replicate_size < 1:
            raise ValueError(f"dp_replicate_size must be at least 1, but got {self.dp_replicate_size}")
        if self.dp_shard_size < 1:
            raise ValueError(f"dp_shard_size must be at least 1, but got {self.dp_shard_size}")
        if self.tp_size < 1:
            raise ValueError(f"tp_size must be at least 1, but got {self.tp_size}")
        if self.cp_size < 1:
            raise ValueError(f"cp_size must be at least 1, but got {self.cp_size}")
        valid_cp_backends = ["torch"]
        if self.cp_backend not in valid_cp_backends:
            raise ValueError(f"cp_backend must be one of {valid_cp_backends}, but got {self.cp_backend}")

        if self.sp_size < 1:
            raise ValueError(f"sp_size must be at least 1, but got {self.sp_size}")
        valid_sp_backends = ["deepspeed"]
        if self.sp_backend not in valid_sp_backends:
            raise ValueError(f"sp_backend must be one of {valid_sp_backends}, but got {self.sp_backend}")

        # CP and SP are mutually exclusive
        if self.cp_size > 1 and self.sp_size > 1:
            raise ValueError(
                "Context Parallelism (CP) and Sequence Parallelism (SP) are mutually exclusive. "
                f"Got cp_size={self.cp_size} and sp_size={self.sp_size}. "
                "Please set either cp_size=1 or sp_size=1."
            )

        if (self.tp_size > 1 or self.cp_size > 1) and self.dp_replicate_size > 1 and self.dp_shard_size == 1:
            raise ValueError(
                "Tensor/Context parallelism (tp/cp_size > 1) cannot be used with pure data parallelism (dp_replicate_size > 1 and dp_shard_size == 1). "
                "Please set dp_shard_size > 1 and dp_replicate_size == 1 to compose FSDP + TP/CP for 2D parallel, "
                "or set dp_replicate_size == 1 and dp_shard_size > 1 to compose HSDP + TP/CP for 3D parallel."
            )
        self._sizes = {
            "dp_replicate": self.dp_replicate_size,
            "dp_shard": self.dp_shard_size,
            "tp": self.tp_size,
            "cp": self.cp_size,
            "sp": self.sp_size,
        }

    def _set_size(self, parallelism: str, size: int):
        assert parallelism in self._sizes.keys(), f"Parallelism must be one of {self._sizes.keys()}"
        self._sizes[parallelism] = size
        setattr(self, f"{parallelism}_size", size)

    def _validate_accelerator(self, accelerator: "Accelerator"):
        _warnings = set()
        if not accelerator.multi_device and self.total_size == 1:
            # No distributed setup, valid parallelism config
            return

        # We need this to ensure DDP works
        if self.total_size == 1:
            self._set_size("dp_replicate", accelerator.num_processes)

        # For DeepSpeed SP, DeepSpeed handles global process groups internally.
        # Skip the total_size == num_processes validation since:
        # 1. DeepSpeed manages SP groups globally via initialize_sequence_parallel()
        # 2. num_processes is per-node in multi-node, but total_size is local parallelism config
        # 3. The actual global parallelism (SP × DP) is handled by DeepSpeed's process groups
        if self.sp_backend == "deepspeed" and self.sp_size > 1:
            pass
        elif self.total_size != accelerator.num_processes:
            raise ValueError(
                f"ParallelismConfig total_size ({self.total_size}) does not match "
                f"num_processes ({accelerator.num_processes}). Please adjust dp_replicate_size/ "
                f"dp_shard_size/tp_size/cp_size/sp_size."
            )

        if self.total_size > 1 and not (
            accelerator.is_fsdp2
            or accelerator.multi_device
            or accelerator.distributed_type == DistributedType.DEEPSPEED
        ):
            raise ValueError(
                f"ParallelismConfig is only compatible DistributedType.FSDP (version 2) or DistributedType.Multi{{Device}} or DistributedType.DEEPSPEED, but got {accelerator.distributed_type}."
            )

        for parallelism, size in self._sizes.items():
            if size == 1 and getattr(self, f"{parallelism}_handler", None) is not None:
                _warnings.add(
                    f"ParallelismConfig.{parallelism}_handler is set, but {parallelism}_size is set to 1. This handler will be ignored."
                )

        if _warnings and accelerator.is_main_process:
            warnings.warn(
                "ParallelismConfig has the following warnings:\n" + "\n".join(_warnings),
                UserWarning,
            )


================================================
FILE: src/accelerate/scheduler.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# We ignore warnings about stepping the scheduler since we step it ourselves during gradient accumulation

import warnings

from .state import AcceleratorState, GradientState


# Ignore every `UserWarning` attributed to the `torch.optim.lr_scheduler` module (see the note above the imports).
warnings.filterwarnings("ignore", category=UserWarning, module="torch.optim.lr_scheduler")


class AcceleratedScheduler:
    """
    Wraps a learning rate scheduler so that it only advances when the optimizer(s) actually performed a training
    step. This keeps the schedule from running ahead when a step was skipped, e.g. because gradients overflowed in
    mixed precision training.

    When performing gradient accumulation scheduler lengths should not be changed accordingly, Accelerate will always
    step the scheduler to account for it.

    Args:
        scheduler (`torch.optim.lr_scheduler._LRScheduler`):
            The scheduler to wrap.
        optimizers (one or a list of `torch.optim.Optimizer`):
            The optimizers used.
        step_with_optimizer (`bool`, *optional*, defaults to `True`):
            Whether or not the scheduler should be stepped at each optimizer step.
        split_batches (`bool`, *optional*, defaults to `False`):
            Whether or not the dataloaders split one batch across the different processes (so batch size is the same
            regardless of the number of processes) or create batches on each process (so batch size is the original
            batch size multiplied by the number of processes).
    """

    def __init__(self, scheduler, optimizers, step_with_optimizer: bool = True, split_batches: bool = False):
        # A single optimizer is normalized to a one-element list
        if not isinstance(optimizers, (list, tuple)):
            optimizers = [optimizers]
        self.scheduler = scheduler
        self.optimizers = optimizers
        self.split_batches = split_batches
        self.step_with_optimizer = step_with_optimizer
        self.gradient_state = GradientState()

    def step(self, *args, **kwargs):
        """
        Step the wrapped scheduler if a training step really happened; `args`/`kwargs` are forwarded to it.

        Nothing is stepped while gradients are being accumulated (only the internal step counter is bumped when
        `adjust_scheduler` is set) or when any optimizer skipped its step.
        """
        wrapped = self.scheduler

        if not self.step_with_optimizer:
            # No link between scheduler and optimizer -> just step
            wrapped.step(*args, **kwargs)
            return

        # Otherwise, first make sure the optimizer was stepped.
        if not self.gradient_state.sync_gradients:
            if self.gradient_state.adjust_scheduler:
                wrapped._step_count += 1
            return

        if any(optimizer.step_was_skipped for optimizer in self.optimizers):
            return

        if self.split_batches:
            # Split batches -> the training dataloader batch size is not changed so one step per training step
            wrapped.step(*args, **kwargs)
            return

        # Otherwise the training dataloader batch size was multiplied by `num_processes`, so we need to do
        # num_processes steps per training step
        for _ in range(AcceleratorState().num_processes):
            # Special case when using OneCycle and `drop_last` was not used: never go past `total_steps`
            has_step_limit = hasattr(self.scheduler, "total_steps")
            if not has_step_limit or self.scheduler._step_count <= self.scheduler.total_steps:
                self.scheduler.step(*args, **kwargs)

    # Passthroughs
    def get_last_lr(self):
        """Return `get_last_lr()` of the wrapped scheduler."""
        return self.scheduler.get_last_lr()

    def state_dict(self):
        """Return `state_dict()` of the wrapped scheduler."""
        return self.scheduler.state_dict()

    def load_state_dict(self, state_dict):
        """Load `state_dict` into the wrapped scheduler."""
        self.scheduler.load_state_dict(state_dict)

    def get_lr(self):
        """Return `get_lr()` of the wrapped scheduler."""
        return self.scheduler.get_lr()

    def print_lr(self, *args, **kwargs):
        """Forward to `print_lr()` of the wrapped scheduler and return its result."""
        return self.scheduler.print_lr(*args, **kwargs)


================================================
FILE: src/accelerate/state.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import logging
import os
import threading
import warnings
import weakref
from contextlib import contextmanager
from functools import partial
from typing import Any, Callable

import torch

from .utils import (
    DistributedType,
    DynamoBackend,
    GradientAccumulationPlugin,
    check_cuda_fp8_capability,
    check_cuda_p2p_ib_support,
    deepspeed_required,
    get_cpu_distributed_information,
    get_int_from_env,
    is_datasets_available,
    is_deepspeed_available,
    is_fp8_available,
    is_habana_gaudi1,
    is_hpu_available,
    is_mlu_available,
    is_mps_available,
    is_musa_available,
    is_neuron_available,
    is_npu_available,
    is_sdaa_available,
    is_torch_xla_available,
    is_xccl_available,
    is_xpu_available,
    parse_choice_from_env,
    parse_flag_from_env,
    set_numa_affinity,
)
from .utils.dataclasses import SageMakerDistributedType


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.runtime as xr

if is_mlu_available(check_device=False):
    import torch_mlu  # noqa: F401

if is_sdaa_available(check_device=False):
    import torch_sdaa  # noqa: F401

if is_musa_available(check_device=False):
    import torch_musa  # noqa: F401

if is_npu_available(check_device=False):
    import torch_npu  # noqa: F401


logger = logging.getLogger(__name__)


def is_initialized() -> bool:
    """
    Module-level counterpart of `AcceleratorState.initialized`: tells whether the `AcceleratorState` has been
    initialized from `Accelerator`, i.e. whether its shared state holds anything.
    """
    shared_state = AcceleratorState._shared_state
    return shared_state != {}


def do_nothing(*args, **kwargs):
    """No-op placeholder: accepts any arguments, ignores them and returns `None`."""
    return


class ThreadLocalSharedDict(threading.local):
    """
    Descriptor holding a dict that is shared between all instances of a class within the same thread.

    Note: Descriptors have slightly different semantics than just a dict field on its own.
    `PartialState(...)._shared_state` and `PartialState._shared_state` (instance vs class) give the same value: the
    underlying _storage dict. Likewise, `PartialState(...)._shared_state = {...}` overrides the _storage dict inside
    the descriptor as you would expect. However, `PartialState._shared_state = {}` actually replaces the descriptor
    object with a dict instead. Thus, you should modify the _storage dict in-place (e.g. `_shared_state.clear()`).

    See Python documentation for an explanation of descriptors: https://docs.python.org/3/howto/descriptor.html

    This is required for using PyTorch/XLA with PJRT in multithreaded mode (required for TPU v2 and v3).

    See https://github.com/pytorch/xla/blob/r2.0/docs/pjrt.md#multithreading-on-tpu-v2v3
    """

    def __init__(self, thread_local: bool = False):
        # `thread_local` is accepted but not used here
        self._storage = dict()

    def __get__(self, obj, objtype=None):
        """Return the underlying storage dict, for both class and instance access."""
        storage = self._storage
        return storage

    def __set__(self, obj, value):
        """Replace the underlying storage with `value` on instance assignment."""
        self._storage = value


# Prefer a plain, process-wide dict for the shared state. When torch_xla is available (e.g. on TPU), use the
# thread-local descriptor instead so that each thread keeps its own state.
SharedDict = dict if not is_torch_xla_available() else ThreadLocalSharedDict


# Inspired by Alex Martelli's 'Borg'.
class PartialState:
    """
    Singleton class that has information about the current training environment and functions to help with process
    control. Designed to be used when only process control and device execution states are needed. Does *not* need to
    be initialized from `Accelerator`.

    Args:
        cpu (`bool`, *optional*):
            Whether or not to force the script to execute on CPU. Will ignore any accelerators available if set to
            `True` and force the execution on the CPU.
        kwargs (additional keyword arguments, *optional*):
            Additional keyword arguments to pass to the relevant `init_process_group` function. Valid `kwargs` can be
            found in [`utils.InitProcessGroupKwargs`]. See the example section for detailed usage.

    **Available attributes:**

        - **device** (`torch.device`) -- The device to use.
        - **distributed_type** ([`~accelerate.state.DistributedType`]) -- The type of distributed environment currently
          in use.
        - **local_process_index** (`int`) -- The index of the current process on the current server.
        - **mixed_precision** (`str`) -- Whether or not the current script will use mixed precision, and if so the type
          of mixed precision being performed. (Choose from 'no', 'fp16', 'bf16' or 'fp8').
        - **num_processes** (`int`) -- The number of processes currently launched in parallel.
        - **process_index** (`int`) -- The index of the current process.
        - **is_last_process** (`bool`) -- Whether or not the current process is the last one.
        - **is_main_process** (`bool`) -- Whether or not the current process is the main one.
        - **is_local_main_process** (`bool`) -- Whether or not the current process is the main one on the local node.
        - **debug** (`bool`) -- Whether or not the current script is being run in debug mode.

    Example:
    ```python
    from accelerate.utils import InitProcessGroupKwargs

    # To include `InitProcessGroupKwargs`, init then call `.to_kwargs()`
    kwargs = InitProcessGroupKwargs(...).to_kwargs()
    state = PartialState(**kwargs)
    ```
    """

    _shared_state = SharedDict()
    _known_attrs = [
        "_cpu",
        "_mixed_precision",
        "_shared_state",
        "backend",
        "debug",
        "device",
        "distributed_type",
        "fork_launched",
        "local_process_index",
        "num_processes",
        "process_index",
    ]

    def __init__(self, cpu: bool = False, **kwargs):
        """
        Set up the shared process state the first time a `PartialState` is created.

        Every instance uses the class-level `_shared_state` as its `__dict__`, so the environment detection below only
        runs while that shared state is still empty; later instantiations reuse the stored values.

        Args:
            cpu (`bool`, *optional*, defaults to `False`):
                Stored as `_cpu` and forwarded to `_prepare_backend`. When `True`, the accelerator process group
                initialization (DeepSpeed / `init_process_group` for non-CPU backends) is skipped.
            kwargs (additional keyword arguments, *optional*):
                `_use_sagemaker_dp` and `backend` are popped here; the remaining entries are forwarded to
                `torch.distributed.init_process_group` (or to DeepSpeed's `init_distributed`).

        Raises:
            ValueError: If the requested `backend` differs from the one selected by `_prepare_backend`, or if a
                multi-node CPU launch has no `MASTER_ADDR` set (and the backend is not "mpi").
            ImportError: If `ACCELERATE_USE_DEEPSPEED` is "true" but DeepSpeed is not available.
            NotImplementedError: If the device is CUDA, `check_cuda_p2p_ib_support()` fails and `NCCL_P2P_DISABLE` /
                `NCCL_IB_DISABLE` are not both present in the environment.
        """
        # Borg pattern: all instances share the same attribute dictionary
        self.__dict__ = self._shared_state
        if not self.initialized:
            self._cpu = cpu
            self.backend = None
            # An explicit device can be forced through the environment; otherwise it is decided by `set_device()`
            env_device = os.environ.get("ACCELERATE_TORCH_DEVICE", None)
            self.device = torch.device(env_device) if env_device is not None else None
            self.debug = parse_flag_from_env("ACCELERATE_DEBUG_MODE")
            use_sagemaker_dp = kwargs.pop("_use_sagemaker_dp", None)
            dist_information = None
            if use_sagemaker_dp is None:
                use_sagemaker_dp = (
                    os.environ.get("ACCELERATE_USE_SAGEMAKER", "false").lower() == "true"
                    and os.environ.get("ACCELERATE_SAGEMAKER_DISTRIBUTED_TYPE") != SageMakerDistributedType.NO
                )

            # Sets up self.backend + imports
            original_backend = kwargs.pop("backend", None)
            backend, distributed_type = self._prepare_backend(cpu, use_sagemaker_dp, original_backend)
            if original_backend is not None and backend != original_backend:
                raise ValueError(f"Your assigned backend {original_backend} is not available, please use {backend}")
            self.backend = backend
            self.distributed_type = distributed_type
            use_deepspeed = False
            if not cpu and self.backend != "xla":
                # A LOCAL_RANK in the environment means the script was started by a distributed launcher
                if int(os.environ.get("LOCAL_RANK", -1)) != -1:
                    # Deal with spawning deepspeed
                    if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false").lower() == "true":
                        if not is_deepspeed_available():
                            raise ImportError(
                                "DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source"
                            )
                        from deepspeed import comm as dist

                        if not dist.is_initialized():
                            if self.backend == "tccl":
                                local_rank = os.environ.get("LOCAL_RANK", -1)
                                torch.sdaa.set_device(f"sdaa:{local_rank}")
                            dist.init_distributed(dist_backend=self.backend, auto_mpi_discovery=False, **kwargs)
                        # We need to flag to `use_deepspeed` to be True to override `distributed_type` later
                        use_deepspeed = True
                    # Deal with all other backends but CPU, that gets handled special later
                    elif (
                        self.distributed_type is not DistributedType.MULTI_CPU
                        and not torch.distributed.is_initialized()
                    ):
                        if self.backend == "tccl":
                            local_rank = os.environ.get("LOCAL_RANK", -1)
                            torch.sdaa.set_device(f"sdaa:{local_rank}")
                        # With FSDP parameter offloading or a full state dict, a gloo CPU backend is added next to
                        # the accelerator backend
                        if (
                            self.backend == "nccl"
                            and os.environ.get("ACCELERATE_USE_FSDP", "false").lower() == "true"
                            and (
                                os.environ.get("FSDP_OFFLOAD_PARAMS", "false").lower() == "true"
                                or os.environ.get("FSDP_STATE_DICT_TYPE", "SHARDED_STATE_DICT") == "FULL_STATE_DICT"
                            )
                        ):
                            self.backend = "cuda:nccl,cpu:gloo"
                        if (
                            self.backend == "xccl"
                            and os.environ.get("ACCELERATE_USE_FSDP", "false").lower() == "true"
                            and (
                                os.environ.get("FSDP_OFFLOAD_PARAMS", "false").lower() == "true"
                                or os.environ.get("FSDP_STATE_DICT_TYPE", "SHARDED_STATE_DICT") == "FULL_STATE_DICT"
                            )
                        ):
                            self.backend = "xpu:xccl,cpu:gloo"
                        torch.distributed.init_process_group(backend=self.backend, **kwargs)

            # CPU require special env configs to be set
            if self.distributed_type == DistributedType.MULTI_CPU:
                dist_information = get_cpu_distributed_information()
                os.environ["RANK"] = str(dist_information.rank)
                os.environ["WORLD_SIZE"] = str(dist_information.world_size)
                os.environ["LOCAL_RANK"] = str(dist_information.local_rank)
                os.environ["LOCAL_WORLD_SIZE"] = str(dist_information.local_world_size)
                if not os.environ.get("MASTER_PORT", None):
                    os.environ["MASTER_PORT"] = "29500"
                # A differing local and global world size means several nodes, which need a rendezvous address
                if (
                    not os.environ.get("MASTER_ADDR", None)
                    and dist_information.local_world_size != dist_information.world_size
                    and self.backend != "mpi"
                ):
                    raise ValueError(
                        "Tried to launch on distributed with multinode, but `MASTER_ADDR` env was not set, "
                        "please try exporting rank 0's hostname as `MASTER_ADDR`"
                    )
                kwargs["rank"] = dist_information.rank
                kwargs["world_size"] = dist_information.world_size

                if (
                    self.distributed_type == DistributedType.MULTI_CPU
                    and get_int_from_env(["OMP_NUM_THREADS"], 0) == 0
                ):
                    import psutil

                    # Share the physical cores evenly between the local processes, with at least one thread each
                    num_cpu_threads_per_process = int(
                        psutil.cpu_count(logical=False) / dist_information.local_world_size
                    )
                    if num_cpu_threads_per_process == 0:
                        num_cpu_threads_per_process = 1
                    torch.set_num_threads(num_cpu_threads_per_process)
                    warnings.warn(
                        f"OMP_NUM_THREADS/MKL_NUM_THREADS unset, we set it at {num_cpu_threads_per_process} to improve oob"
                        " performance."
                    )

                if not torch.distributed.is_initialized():
                    torch.distributed.init_process_group(backend=self.backend, **kwargs)

            # No backend == no distributed training
            if self.backend is None:
                self.distributed_type = DistributedType.NO
                self.num_processes = 1
                self.process_index = 0
                self.local_process_index = 0
            elif self.backend == "xla":
                # XLA needs device setting first for `set_replication`
                self.set_device()
                xm.set_replication(self.device, xm.get_xla_supported_devices())
                self.num_processes = xr.world_size()
                self.process_index = xr.global_ordinal()
                if is_torch_xla_available(check_is_tpu=True):
                    self.local_process_index = xm.get_local_ordinal()
                else:
                    self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
            else:
                # Any other backend: read the topology from the initialized torch.distributed process group
                self.num_processes = torch.distributed.get_world_size()
                self.process_index = torch.distributed.get_rank()
                self.local_process_index = (
                    int(os.environ.get("LOCAL_RANK", -1)) if dist_information is None else dist_information.local_rank
                )
            self.set_device()
            # Now we can change to deepspeed
            if use_deepspeed:
                self.distributed_type = DistributedType.DEEPSPEED

            # Set CPU affinity if enabled
            if parse_flag_from_env("ACCELERATE_CPU_AFFINITY", False):
                set_numa_affinity(self.local_process_index)

            # Check for old RTX 4000's that can't use P2P or IB and are on old drivers
            if self.device.type == "cuda" and not check_cuda_p2p_ib_support():
                if "NCCL_P2P_DISABLE" not in os.environ or "NCCL_IB_DISABLE" not in os.environ:
                    raise NotImplementedError(
                        "Using RTX 4000 series doesn't support faster communication broadband via P2P or IB. "
                        'Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1" or use `accelerate launch` which '
                        "will do this automatically."
                    )

        # Important: This should be the *only* code outside of `self.initialized!`
        self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0)

    def __repr__(self) -> str:
        return (
            f"Distributed environment: {self.distributed_type}{('  Backend: ' + self.backend) if self.backend else ''}\n"
            f"Num processes: {self.num_processes}\n"
            f"Process index: {self.process_index}\n"
            f"Local process index: {self.local_process_index}\n"
            f"Device: {self.device}\n"
        )

    @staticmethod
    def _reset_state():
        "Resets `_shared_state`, is used internally and should not be called"
        PartialState._shared_state.clear()

    @property
    def initialized(self) -> bool:
        "Returns whether the `PartialState` has been initialized"
        return self._shared_state != {}

    @property
    def use_distributed(self):
        """
        Whether the Accelerator is configured for distributed training, i.e. the distributed type is not
        `DistributedType.NO` and more than one process is running.
        """
        has_distributed_type = self.distributed_type != DistributedType.NO
        return has_distributed_type and self.num_processes > 1

    @property
    def is_last_process(self) -> bool:
        "Returns whether the current process is the last one"
        return self.process_index == self.num_processes - 1

    @property
    def is_main_process(self) -> bool:
        """
        Returns whether the current process is the main process: the one with index 0, except with
        `DistributedType.MEGATRON_LM` where the last process is used.
        """
        if self.distributed_type != DistributedType.MEGATRON_LM:
            return self.process_index == 0
        return self.is_last_process

    @property
    def is_local_main_process(self) -> bool:
        """
        Returns whether the current process is the main process on the local node: the one with local index 0,
        except with `DistributedType.MEGATRON_LM` where the last process is used.
        """
        if self.distributed_type != DistributedType.MEGATRON_LM:
            return self.local_process_index == 0
        return self.is_last_process

    def wait_for_everyone(self):
        """
        Blocks the current process until every other process has reached this point (so this does nothing when the
        script is only run in one process). Useful to do before saving a model.

        Example:

        ```python
        >>> # Assuming two GPU processes
        >>> import time
        >>> from accelerate.state import PartialState

        >>> state = PartialState()
        >>> if state.is_main_process:
        ...     time.sleep(2)
        >>> else:
        ...     print("I'm waiting for the main process to finish its sleep...")
        >>> state.wait_for_everyone()
        >>> # Should print on every process at the same time
        >>> print("Everyone is here")
        ```
        """
        # Distributed types that synchronize through a `torch.distributed` barrier
        barrier_types = (
            DistributedType.MULTI_GPU,
            DistributedType.MULTI_MLU,
            DistributedType.MULTI_SDAA,
            DistributedType.MULTI_MUSA,
            DistributedType.MULTI_NPU,
            DistributedType.MULTI_XPU,
            DistributedType.MULTI_CPU,
            DistributedType.MULTI_HPU,
            DistributedType.MULTI_NEURON,
            DistributedType.DEEPSPEED,
            DistributedType.FSDP,
        )
        if self.distributed_type in barrier_types:
            torch.distributed.barrier(device_ids=[self.local_process_index])
            return
        if self.distributed_type == DistributedType.XLA:
            xm.rendezvous("accelerate.utils.wait_for_everyone")

    def _goes_first(self, is_main: bool):
        if not is_main:
            self.wait_for_everyone()

        yield

        if is_main:
            self.wait_for_everyone()

    @contextmanager
    def split_between_processes(self, inputs: list | tuple | dict | torch.Tensor, apply_padding: bool = False):
        """
        Splits `input` between `self.num_processes` quickly and can be then used on that process. Useful when doing
        distributed inference, such as with different prompts.

        Note that when using a `dict`, all keys need to have the same number of elements.

        Args:
            inputs (`list`, `tuple`, `torch.Tensor`, `dict` of `list`/`tuple`/`torch.Tensor`, or `datasets.Dataset`):
                The input to split between processes.
            apply_padding (`bool`, `optional`, defaults to `False`):
                Whether to apply padding by repeating the last element of the input so that all processes have the same
                number of elements. Useful when trying to perform actions such as `gather()` on the outputs or passing
                in less inputs than there are processes. If so, just remember to drop the padded elements afterwards.


        Example:

        ```python
        # Assume there are two processes
        from accelerate import PartialState

        state = PartialState()
        with state.split_between_processes(["A", "B", "C"]) as inputs:
            print(inputs)
        # Process 0
        ["A", "B"]
        # Process 1
        ["C"]

        with state.split_between_processes(["A", "B", "C"], apply_padding=True) as inputs:
            print(inputs)
        # Process 0
        ["A", "B"]
        # Process 1
        ["C", "C"]
        ```
        """
        if self.num_processes == 1:
            yield inputs
            return
        length = len(inputs)
        # Nested dictionary of any types
        if isinstance(inputs, dict):
            length = len(inputs[list(inputs.keys())[0]])
            if not all(len(v) == length for v in inputs.values()):
                raise ValueError("All values in the dictionary must have the same length")
        num_samples_per_process, num_extras = divmod(length, self.num_processes)
        start_index = self.process_index * num_samples_per_process + min(self.process_index, num_extras)
        end_index = start_index + num_samples_per_process + (1 if self.process_index < num_extras else 0)

        def _split_values(inputs, start_index, end_index):
            if isinstance(inputs, (list, tuple, torch.Tensor)):
                if start_index >= len(inputs):
                    result = inputs[-1:]
                else:
                    result = inputs[start_index:end_index]
                if apply_padding:
                    if isinstance(result, torch.Tensor):
                        from accelerate.utils import pad_across_processes, send_to_device

                        # The tensor needs to be on the device before we can pad it
                        tensorized_result = send_to_device(result, self.device)
                        result = pad_across_processes(tensorized_result, pad_index=inputs[-1])
                    else:
                        result += [result[-1]] * (num_samples_per_process + (1 if num_extras > 0 else 0) - len(result))
                return result
            elif isinstance(inputs, dict):
                for key in inputs.keys():
                    inputs[key] = _split_values(inputs[key], start_index, end_index)
                return inputs
            else:
                if is_datasets_available():
                    from datasets import Dataset

                    if isinstance(inputs, Dataset):
                        if start_index >= len(inputs):
                            start_index = len(inputs) - 1
                        if end_index > len(inputs):
                            end_index = len(inputs)
                        result_idcs = list(range(start_index, end_index))
                        if apply_padding:
                            result_idcs += [end_index - 1] * (
                                num_samples_per_process + (1 if num_extras > 0 else 0) - len(result_idcs)
                            )
                        return inputs.select(result_idcs)
                return inputs

        yield _split_values(inputs, start_index, end_index)

    @contextmanager
    def main_process_first(self):
        """
        Context manager under which the main process runs the `with` block before any other process.

        The remaining processes only enter the block once the main process has left it. The ordering itself is
        delegated to `self._goes_first`, which receives whether the current process is the main one.

        Example:

        ```python
        >>> from accelerate import Accelerator

        >>> accelerator = Accelerator()
        >>> with accelerator.main_process_first():
        ...     # Process 0 prints first; the other processes follow in no particular order.
        ...     print(f"This will be printed by process {accelerator.process_index}")
        ```
        """
        goes_first = self.is_main_process
        yield from self._goes_first(goes_first)

    @contextmanager
    def local_main_process_first(self):
        """
        Context manager under which the local main process runs the `with` block before the other processes.

        The remaining processes only enter the block once the local main process has left it. The ordering itself is
        delegated to `self._goes_first`, which receives whether the current process is the local main one.

        Example:

        ```python
        >>> from accelerate.state import PartialState

        >>> state = PartialState()
        >>> with state.local_main_process_first():
        ...     # Local process 0 prints first; the other processes follow in no particular order.
        ...     print(f"This will be printed by process {state.local_process_index}")
        ```
        """
        goes_first = self.is_local_main_process
        yield from self._goes_first(goes_first)

    def on_main_process(self, function: Callable[..., Any] | None = None):
        """
        Decorator that only runs the decorated function on the main process.

        Args:
            function (`Callable`): The function to decorate.

        Example:

        ```python
        >>> from accelerate.state import PartialState

        >>> state = PartialState()


        >>> @state.on_main_process
        ... def print_something():
        ...     print("This will be printed by process 0 only.")


        >>> print_something()
        "This will be printed by process 0 only"
        ```
        """
        if not self.initialized:
            raise ValueError("The `PartialState` or `Accelerator` must be initialized before calling this function.")
        if self.is_main_process or not self.use_distributed:
            return function
        return do_nothing

    def on_local_main_process(self, function: Callable[..., Any] | None = None):
        """
        Decorator that only runs the decorated function on the local main process.

        Args:
            function (`Callable`): The function to decorate.

        Example:
        ```python
        # Assume we have 2 servers with 4 processes each.
        from accelerate.state import PartialState

        state = PartialState()


        @state.on_local_main_process
        def print_something():
            print("This will be printed by process 0 only on each server.")


        print_something()
        # On server 1:
        "This will be printed by process 0 only"
        # On server 2:
        "This will be printed by process 0 only"
        ```
        """
        if self.is_local_main_process or not self.use_distributed:
            return function
        return do_nothing

    def on_last_process(self, function: Callable[..., Any]):
        """
        Decorator restricting the decorated function to the last process.

        On the last process, or when the run is not distributed, `function` is returned untouched. On every other
        process the no-op `do_nothing` is returned in its place.

        Args:
            function (`Callable`): The function to decorate.

        Example:
        ```python
        # Assume we have 4 processes.
        from accelerate.state import PartialState

        state = PartialState()


        @state.on_last_process
        def print_something():
            print(f"Printed on process {state.process_index}")


        print_something()
        "Printed on process 3"
        ```
        """
        # Only a non-last process of a distributed run gets the no-op.
        if not self.is_last_process and self.use_distributed:
            return do_nothing
        return function

    def on_process(self, function: Callable[..., Any] | None = None, process_index: int | None = None):
        """
        Decorator that only runs the decorated function on the process with the given index.

        Args:
            function (`Callable`, `optional`):
                The function to decorate.
            process_index (`int`, `optional`):
                The index of the process on which to run the function.

        Example:
        ```python
        # Assume we have 4 processes.
        from accelerate.state import PartialState

        state = PartialState()


        @state.on_process(process_index=2)
        def print_something():
            print(f"Printed on process {state.process_index}")


        print_something()
        "Printed on process 2"
        ```
        """
        if function is None:
            return partial(self.on_process, process_index=process_index)
        if (self.process_index == process_index) or (not self.use_distributed):
            return function
        return do_nothing

    def on_local_process(self, function: Callable[..., Any] | None = None, local_process_index: int | None = None):
        """
        Decorator that only runs the decorated function on the process with the given index on the current node.

        Args:
            function (`Callable`, *optional*):
                The function to decorate.
            local_process_index (`int`, *optional*):
                The index of the local process on which to run the function.

        Example:
        ```python
        # Assume we have 2 servers with 4 processes each.
        from accelerate import Accelerator

        accelerator = Accelerator()


        @accelerator.on_local_process(local_process_index=2)
        def print_something():
            print(f"Printed on process {accelerator.local_process_index}")


        print_something()
        # On server 1:
        "Printed on process 2"
        # On server 2:
        "Printed on process 2"
        ```
        """
        if function is None:
            return partial(self.on_local_process, local_process_index=local_process_index)
        if (self.local_process_index == local_process_index) or (not self.use_distributed):
            return function
        return do_nothing

    def print(self, *args, **kwargs):
        """
        Forwards `args` and `kwargs` to the builtin `print`, but only on the local main process; every other process
        prints nothing.
        """
        if not self.is_local_main_process:
            return
        print(*args, **kwargs)

    @property
    def default_device(self) -> torch.device:
        """
        Returns the default device, which is the first available entry in this order:
        - MPS if `is_mps_available()` (this also sets `PYTORCH_ENABLE_MPS_FALLBACK=1` in the environment)
        - MLU if `is_mlu_available()`
        - SDAA if `is_sdaa_available()`
        - MUSA if `is_musa_available()`
        - NPU if `is_npu_available()`
        - HPU if `is_hpu_available()`
        - CUDA if `torch.cuda.is_available()`
        - XPU if `is_xpu_available()`
        - NEURON if `is_neuron_available()`
        - CPU otherwise
        """
        if is_mps_available():
            os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
            return torch.device("mps")

        # Probes are called lazily, in order; the first one that succeeds decides the device.
        # NPU should be checked before CUDA when using `transfer_to_npu`
        # See issue #3020: https://github.com/huggingface/accelerate/issues/3020
        probes = (
            (is_mlu_available, "mlu"),
            (is_sdaa_available, "sdaa"),
            (is_musa_available, "musa"),
            (is_npu_available, "npu"),
            (is_hpu_available, "hpu"),
            (torch.cuda.is_available, "cuda"),
            (is_xpu_available, "xpu"),
            (is_neuron_available, "neuron"),
        )
        for is_available, device_type in probes:
            if is_available():
                return torch.device(device_type)
        return torch.device("cpu")

    def _prepare_backend(
        self, cpu: bool = False, sagemaker_dp=False, backend: str | None = None
    ) -> tuple[str, DistributedType]:
        """
        Prepares any imports needed before initializing the distributed backend and works out which backend and
        distributed type apply to the current environment.

        Args:
            cpu (`bool`, defaults to `False`):
                Whether accelerators must be ignored. When `True`, only the multi-CPU path can be selected.
            sagemaker_dp (`bool`, defaults to `False`):
                Whether to use SageMaker data parallelism, which imports `smdistributed` and forces the `smddp`
                backend.
            backend (`str`, *optional*):
                Backend requested by the caller. It is honoured for HPU, CUDA and XPU; the other accelerator branches
                always override it. For multi-CPU it is only kept when it is `"mpi"` and MPI is available.

        Returns:
            `tuple[str, DistributedType]`: The backend name (possibly still `None` or the value passed in when no
            branch overrides it) and the detected distributed type (`DistributedType.NO` when nothing matched).
        """
        distributed_type = None
        if sagemaker_dp:
            import smdistributed.dataparallel.torch.torch_smddp  # noqa

            backend = "smddp"
            distributed_type = DistributedType.MULTI_GPU
        elif is_torch_xla_available():
            backend = "xla"
            distributed_type = DistributedType.XLA

        elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
            # Single mutually exclusive chain: the first available accelerator wins, mirroring the precedence of
            # `default_device`. Every branch must be an `elif`, otherwise a later match would overwrite an earlier
            # one (e.g. MLU being replaced by CUDA).
            if is_mlu_available():
                backend = "cncl"
                distributed_type = DistributedType.MULTI_MLU
            elif is_sdaa_available():
                backend = "tccl"
                distributed_type = DistributedType.MULTI_SDAA
            elif is_musa_available():
                backend = "mccl"
                distributed_type = DistributedType.MULTI_MUSA
            # NPU should be checked before CUDA when using `transfer_to_npu`
            # See issue #3020: https://github.com/huggingface/accelerate/issues/3020
            elif is_npu_available():
                backend = "hccl"
                distributed_type = DistributedType.MULTI_NPU
            elif is_hpu_available(init_hccl=True):
                if backend is None:
                    backend = "hccl"
                distributed_type = DistributedType.MULTI_HPU
            elif torch.cuda.is_available():
                if backend is None:
                    backend = "nccl"
                distributed_type = DistributedType.MULTI_GPU
            elif is_xpu_available() and is_xccl_available():
                if backend is None:
                    backend = "xccl"
                distributed_type = DistributedType.MULTI_XPU
            elif is_neuron_available():
                backend = "neuron"
                distributed_type = DistributedType.MULTI_NEURON

        # Multi-CPU is only considered when nothing above matched and CPU was explicitly requested, and either a
        # local rank is set or one of the launcher world-size variables reports more than one process.
        if (
            distributed_type is None
            and cpu
            and (
                int(os.environ.get("LOCAL_RANK", -1)) != -1
                or get_int_from_env(["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"], 1) > 1
            )
        ):
            distributed_type = DistributedType.MULTI_CPU

            if backend in (None, "mpi") and torch.distributed.is_mpi_available():
                backend = "mpi"
            else:
                backend = "gloo"
        if distributed_type is None:
            distributed_type = DistributedType.NO

        return backend, distributed_type

    def set_device(self):
        """
        Populates `self.device` according to the current distributed environment.

        Does nothing when `self.device` is already set. Without a distributed setup the CPU is used when `self._cpu`
        is set and `self.default_device` otherwise. In a distributed setup the device type is derived from the name
        of `self.distributed_type` and, except for XLA and HPU, the device index is the local process index modulo
        the number of devices reported by the matching `torch` submodule.

        Raises:
            ValueError: If the device type derived from `self.distributed_type` is not a supported one.
        """
        if self.device is not None:
            return
        if self.distributed_type == DistributedType.NO:
            if self._cpu:
                self.device = torch.device("cpu")
            else:
                self.device = self.default_device
            return

        # e.g. `DistributedType.MULTI_GPU` -> "gpu"
        backend_name = str(self.distributed_type).rsplit(".", 1)[-1].replace("MULTI_", "").lower()
        supported = ("cpu", "gpu", "mlu", "musa", "npu", "xpu", "xla", "hpu", "sdaa", "neuron")
        if backend_name not in supported:
            raise ValueError(
                f"Can't set device for {self.distributed_type} ({backend_name}), verify we should be calling `_set_device()` for it!"
            )

        if backend_name == "xla":
            self.device = xm.xla_device()
            return
        if backend_name == "hpu":
            self.device = torch.device("hpu", torch.hpu.current_device())
            return

        # torch exposes GPUs under the "cuda" name.
        torch_name = "cuda" if backend_name == "gpu" else backend_name
        device_module = getattr(torch, torch_name)
        local_index = self.local_process_index % device_module.device_count()
        self.device = torch.device(torch_name, local_index)
        device_module.set_device(self.device)

    def destroy_process_group(self, group=None):
        """
        Tears down `group`, or the default process group when `group` is `None`.

        Nothing happens when `self.fork_launched` is set and no explicit group was given, or when
        `torch.distributed` has not been initialized.
        """
        targets_default_group = group is None
        skip = self.fork_launched and targets_default_group
        # needed when using torch.distributed.init_process_group
        if not skip and torch.distributed.is_initialized():
            torch.distributed.destroy_process_group(group)

    def __getattr__(self, name: str):
        # By this point we know that no attributes of `self` contain `name`,
        # so we just modify the error message
        if name in self._known_attrs:
            raise AttributeError(
                f"`PartialState` object has no attribute `{name}`. "
                "This happens if `PartialState._reset_state()` was called and "
                "an `Accelerator` or `PartialState` was not reinitialized."
            )
        # Raise a typical AttributeError
        raise AttributeError(f"'PartialState' object has no attribute '{name}'")


class AcceleratorState:
    """
    Singleton class that has information about the current training environment.

    All instances share the same `__dict__` (`_shared_state`), which is seeded from `PartialState._shared_state` and
    then extended with the training-specific settings below.

    **Available attributes:**

        - **device** (`torch.device`) -- The device to use.
        - **distributed_type** ([`~accelerate.state.DistributedType`]) -- The type of distributed environment currently
          in use.
        - **parallelism_config** ([`~accelerate.utils.ParallelismConfig`]) -- The parallelism configuration for the
          current training environment. This is used to configure the distributed training environment.
        - **initialized** (`bool`) -- Whether or not the `AcceleratorState` has been initialized from `Accelerator`.
        - **local_process_index** (`int`) -- The index of the current process on the current server.
        - **mixed_precision** (`str`) -- Whether or not the current script will use mixed precision, and if so the type
          of mixed precision being performed. (Choose from 'no', 'fp16', 'bf16' or 'fp8').
        - **num_processes** (`int`) -- The number of processes currently launched in parallel.
        - **process_index** (`int`) -- The index of the current process.
        - **is_last_process** (`bool`) -- Whether or not the current process is the last one.
        - **is_main_process** (`bool`) -- Whether or not the current process is the main one.
        - **is_local_main_process** (`bool`) -- Whether or not the current process is the main one on the local node.
        - **debug** (`bool`) -- Whether or not the current script is being run in debug mode.
    """

    _shared_state = SharedDict()
    _known_attrs = PartialState._known_attrs + [
        "deepspeed_plugin",
        "fsdp_plugin",
        "megatron_lm_plugin",
        "dynamo_plugin",
    ]

    def __init__(
        self,
        mixed_precision: str | None = None,
        cpu: bool = False,
        dynamo_plugin=None,
        deepspeed_plugin=None,
        fsdp_plugin=None,
        torch_tp_plugin=None,
        megatron_lm_plugin=None,
        parallelism_config=None,
        _from_accelerator: bool = False,
        **kwargs,
    ):
        """
        Binds this instance to the shared state and, on first use, configures it.

        Args:
            mixed_precision (`str`, *optional*):
                Requested mixed precision mode; when `None` it is read from `ACCELERATE_MIXED_PRECISION`.
            cpu (`bool`, defaults to `False`):
                Whether to force CPU execution. Also enabled by the `ACCELERATE_USE_CPU` environment flag.
            dynamo_plugin, deepspeed_plugin, fsdp_plugin, torch_tp_plugin, megatron_lm_plugin, parallelism_config:
                Plugin / configuration objects stored on the state or used to pick the final `distributed_type`.
            _from_accelerator (`bool`, defaults to `False`):
                Internal flag; first-time initialization is only allowed when this is `True`.
            **kwargs:
                Forwarded to `PartialState` when it still has to be created.

        Raises:
            ValueError: If the state is not yet initialized and `_from_accelerator` is `False`, if `fp8` is requested
                without a supporting library, if context parallelism is requested without a suitable FSDP plugin, or
                if an already initialized state is asked to change `cpu` / `mixed_precision`.
        """
        self.__dict__ = self._shared_state
        if parse_flag_from_env("ACCELERATE_USE_CPU"):
            cpu = True
        if PartialState._shared_state == {}:
            PartialState(cpu, **kwargs)
        self.__dict__.update(PartialState._shared_state)
        self._check_initialized(mixed_precision, cpu)
        if not self.initialized:
            # Validate *before* writing anything into the shared dict. `initialized` is defined as "the shared dict
            # differs from `PartialState`'s", so raising after the first assignment would leave the singleton looking
            # initialized while `_mixed_precision` & co. were never set, breaking every later construction.
            if not _from_accelerator:
                raise ValueError(
                    "Please make sure to properly initialize your accelerator via `accelerator = Accelerator()` "
                    "before using any functionality from the `accelerate` library."
                )
            self.deepspeed_plugins = None
            self.torch_tp_plugin = torch_tp_plugin
            self.parallelism_config = parallelism_config
            self.device_mesh = None
            mixed_precision = (
                parse_choice_from_env("ACCELERATE_MIXED_PRECISION", "no")
                if mixed_precision is None
                else mixed_precision.lower()
            )
            if mixed_precision == "fp8":
                # this is confusing, why is is_fp8_available only checks for library availability ?
                if not is_fp8_available():
                    raise ValueError(
                        "Using `fp8` precision requires `transformer_engine` or `MS-AMP` to be installed."
                    )
                elif torch.cuda.is_available() and not check_cuda_fp8_capability():
                    logger.warning(
                        f"The current device has compute capability of {torch.cuda.get_device_capability()} which is "
                        "insufficient for FP8 mixed precision training (requires a GPU Hopper/Ada Lovelace "
                        "or higher, compute capability of 8.9 or higher). Will use FP16 instead."
                    )
                    mixed_precision = "fp16"
                elif is_habana_gaudi1():
                    logger.warning(
                        "The current HPU device is Gaudi1 which does not support FP8 mixed precision training (requires "
                        "Gaudi2 or higher). Will use BF16 instead."
                    )
                    mixed_precision = "bf16"

            self.dynamo_plugin = dynamo_plugin
            # deepspeed handles mixed_precision using deepspeed_config. But we need to set it to fp8
            # if we're using fp8.
            if self.distributed_type == DistributedType.DEEPSPEED and mixed_precision != "fp8":
                self._mixed_precision = "no"
            else:
                self._mixed_precision = mixed_precision

            if self.distributed_type == DistributedType.XLA and is_torch_xla_available(check_is_tpu=True):
                if mixed_precision == "bf16":
                    if os.environ.get("ACCELERATE_DOWNCAST_BF16"):
                        os.environ["XLA_USE_BF16"] = str(0)
                        os.environ["XLA_DOWNCAST_BF16"] = str(1)
                        self.downcast_bfloat = True
                    else:
                        os.environ["XLA_USE_BF16"] = str(1)
                        os.environ["XLA_DOWNCAST_BF16"] = str(0)
                        self.downcast_bfloat = False
            elif os.environ.get("ACCELERATE_USE_DEEPSPEED", "false").lower() == "true" and not cpu:
                self.distributed_type = DistributedType.DEEPSPEED
                if not isinstance(deepspeed_plugin, dict):
                    deepspeed_plugin.set_mixed_precision(mixed_precision)
                    deepspeed_plugin.select(_from_accelerator_state=True)
                else:
                    for plugin in deepspeed_plugin.values():
                        plugin.set_mixed_precision(mixed_precision)
                    # The first plugin passed in is always the active one
                    first_plugin = next(iter(deepspeed_plugin.values()))
                    first_plugin.select(_from_accelerator_state=True)
                self.deepspeed_plugins = deepspeed_plugin
            elif self.distributed_type in [
                DistributedType.MULTI_GPU,
                DistributedType.MULTI_MLU,
                DistributedType.MULTI_SDAA,
                DistributedType.MULTI_MUSA,
                DistributedType.MULTI_NPU,
                DistributedType.MULTI_XPU,
                DistributedType.MULTI_HPU,
                DistributedType.MULTI_NEURON,
            ]:
                # TODO: Siro - remove when axolotl fixes their side
                if not os.environ.get("ACCELERATE_ALLOW_CP_STANDALONE", "false").lower() == "true":
                    if self.parallelism_config and self.parallelism_config.cp_enabled and fsdp_plugin is None:
                        raise ValueError(
                            "`cp_size > 1` specified in the `parallelism_config`, but no `fsdp_plugin` was provided. We need a `fsdp_plugin` to use context parallelism with `cp_backend=torch`, as we also shard the model across the device mesh to save more memory"
                        )
                    # Safe to dereference `fsdp_plugin` here: the check above already rejected `None` when
                    # context parallelism is enabled.
                    if (
                        self.parallelism_config is not None
                        and self.parallelism_config.cp_enabled
                        and fsdp_plugin.fsdp_version == 1
                    ):
                        raise ValueError(
                            "Using `cp_size>1` requires FSDP2, but the provided `fsdp_plugin` is using FSDP1. "
                        )
                if (os.environ.get("ACCELERATE_USE_FSDP", "false").lower() == "true" or fsdp_plugin is not None) or (
                    self.parallelism_config is not None and self.parallelism_config.cp_enabled
                ):
                    self.distributed_type = DistributedType.FSDP
                    if self._mixed_precision != "no" and fsdp_plugin is not None:
                        fsdp_plugin.set_mixed_precision(self._mixed_precision)
                    self.fsdp_plugin = fsdp_plugin
                if os.environ.get(
                    "ACCELERATE_USE_MEGATRON_LM", "false"
                ).lower() == "true" and self.distributed_type not in [
                    DistributedType.MULTI_XPU,
                ]:
                    self.distributed_type = DistributedType.MEGATRON_LM
                    megatron_lm_plugin.set_mixed_precision(self._mixed_precision)
                    self.megatron_lm_plugin = megatron_lm_plugin
            # With a dynamo backend and no mixed precision, allow TF32 matmuls on CUDA / MUSA devices.
            if (
                self.dynamo_plugin.backend != DynamoBackend.NO
                and self._mixed_precision == "no"
                and self.device.type == "cuda"
            ):
                torch.backends.cuda.matmul.allow_tf32 = True
            if (
                self.dynamo_plugin.backend != DynamoBackend.NO
                and self._mixed_precision == "no"
                and self.device.type == "musa"
            ):
                torch.backends.musa.matmul.allow_tf32 = True
            # Propagate the (possibly upgraded) distributed type back to `PartialState`.
            PartialState._shared_state["distributed_type"] = self.distributed_type

    @property
    def initialized(self) -> bool:
        "Whether the shared state holds anything beyond what was copied over from `PartialState`"
        return self._shared_state != PartialState._shared_state

    def __repr__(self):
        summary = PartialState().__repr__() + f"\nMixed precision type: {self.mixed_precision}\n"
        if self.distributed_type == DistributedType.DEEPSPEED:
            summary += f"ds_config: {self.deepspeed_plugin.deepspeed_config}\n"
        return summary

    def _check_initialized(self, mixed_precision=None, cpu=None):
        "Checks if a modification is trying to be made and the `AcceleratorState` has already been initialized"
        if self.initialized:
            err = "AcceleratorState has already been initialized and cannot be changed, restart your runtime completely and pass `{flag}` to `Accelerator()`."
            if cpu and self.device.type != "cpu":
                raise ValueError(err.format(flag="cpu=True"))
            if (
                mixed_precision is not None
                and mixed_precision != self._mixed_precision
                and self.distributed_type != DistributedType.DEEPSPEED
            ):
                raise ValueError(err.format(flag=f"mixed_precision='{mixed_precision}'"))

    @property
    def mixed_precision(self):
        """
        The effective mixed precision mode. Under DeepSpeed (unless `fp8` was requested) it is read from the active
        plugin's `deepspeed_config`; otherwise it is the value stored at initialization.
        """
        if self.distributed_type == DistributedType.DEEPSPEED and self._mixed_precision != "fp8":
            config = self.deepspeed_plugin.deepspeed_config
            if config.get("fp16", {}).get("enabled", False):
                mixed_precision = "fp16"
            elif config.get("bf16", {}).get("enabled", False):
                mixed_precision = "bf16"
            else:
                mixed_precision = "no"
        else:
            mixed_precision = self._mixed_precision
        return mixed_precision

    @staticmethod
    def _reset_state(reset_partial_state: bool = False):
        "Resets `_shared_state`, is used internally and should not be called"
        AcceleratorState._shared_state.clear()
        if reset_partial_state:
            PartialState._reset_state()

    def destroy_process_group(self, group=None):
        """
        Destroys the process group. If one is not specified, the default process group is destroyed.

        If `self.fork_launched` is `True` and `group` is `None`, nothing happens.
        """
        PartialState().destroy_process_group(group)

    @property
    def fork_launched(self):
        "Mirrors `PartialState().fork_launched`"
        return PartialState().fork_launched

    @property
    def use_distributed(self):
        """
        Whether the Accelerator is configured for distributed training
        """
        return PartialState().use_distributed

    @property
    def is_fsdp2(self) -> bool:
        "Returns whether FSDP is in use and the configured `fsdp_plugin` targets FSDP version 2"
        return self.distributed_type == DistributedType.FSDP and self.fsdp_plugin.fsdp_version == 2

    @property
    def is_last_process(self) -> bool:
        "Returns whether the current process is the last one"
        return PartialState().is_last_process

    @property
    def is_main_process(self) -> bool:
        "Returns whether the current process is the main process"
        return PartialState().is_main_process

    @property
    def is_local_main_process(self) -> bool:
        "Returns whether the current process is the main process on the local node"
        return PartialState().is_local_main_process

    def wait_for_everyone(self):
        "Delegates to `PartialState().wait_for_everyone()`"
        PartialState().wait_for_everyone()

    @contextmanager
    def split_between_processes(self, inputs: list | tuple | dict | torch.Tensor, apply_padding: bool = False):
        """
        Splits `input` between `self.num_processes` quickly and can be then used on that process. Useful when doing
        distributed inference, such as with different prompts.

        Note that when using a `dict`, all keys need to have the same number of elements.

        Args:
            inputs (`list`, `tuple`, `torch.Tensor`, or `dict` of `list`/`tuple`/`torch.Tensor`):
                The input to split between processes.
            apply_padding (`bool`, `optional`, defaults to `False`):
                Whether to apply padding by repeating the last element of the input so that all processes have the same
                number of elements. Useful when trying to perform actions such as `gather()` on the outputs or passing
                in less inputs than there are processes. If so, just remember to drop the padded elements afterwards.


        Example:

        ```python
        # Assume there are two processes
        from accelerate.state import AcceleratorState

        state = AcceleratorState()
        with state.split_between_processes(["A", "B", "C"]) as inputs:
            print(inputs)
        # Process 0
        ["A", "B"]
        # Process 1
        ["C"]

        with state.split_between_processes(["A", "B", "C"], apply_padding=True) as inputs:
            print(inputs)
        # Process 0
        ["A", "B"]
        # Process 1
        ["C", "C"]
        ```
        """
        with PartialState().split_between_processes(inputs, apply_padding=apply_padding) as inputs:
            yield inputs

    @contextmanager
    def main_process_first(self):
        """
        Lets the main process go first inside a with block.

        The other processes will enter the with block after the main process exits.
        """
        with PartialState().main_process_first():
            yield

    @contextmanager
    def local_main_process_first(self):
        """
        Lets the local main process go first inside a with block.

        The other processes will enter the with block after the local main process exits.
        """
        with PartialState().local_main_process_first():
            yield

    @property
    def deepspeed_plugin(self):
        """
        Returns the currently active DeepSpeedPlugin.

        If not using deepspeed, returns `None`.
        """
        # To maintain original behavior, return None if not using deepspeed.
        if self.distributed_type != DistributedType.DEEPSPEED:
            return None
        from accelerate.utils.deepspeed import get_active_deepspeed_plugin

        return get_active_deepspeed_plugin(self)

    @deepspeed_required
    def get_deepspeed_plugin(self, name: str):
        """
        Returns the DeepSpeedPlugin with the given plugin_key.
        """
        return self.deepspeed_plugins[name]

    @deepspeed_required
    def select_deepspeed_plugin(self, name: str | None = None):
        """
        Activates the DeepSpeedPlugin with the given `name`, and will disable all other plugins.
        """
        for key, plugin in self.deepspeed_plugins.items():
            if key != name:
                plugin._unselect()
        self.deepspeed_plugins[name].select(_from_accelerator_state=True)

    def print(self, *args, **kwargs):
        "Delegates to `PartialState().print`"
        PartialState().print(*args, **kwargs)

    def __getattr__(self, name: str):
        # By this point we know that no attributes of `self` contain `name`,
        # so we just modify the error message
        if name in self._known_attrs:
            raise AttributeError(
                f"`AcceleratorState` object has no attribute `{name}`. "
                "This happens if `AcceleratorState._reset_state()` was called and "
                "an `Accelerator` or `PartialState` was not reinitialized."
            )
        # Raise a typical AttributeError
        raise AttributeError(f"'AcceleratorState' object has no attribute '{name}'")


class GradientState:
    """
    Singleton class that has information related to gradient synchronization for gradient accumulation

    All instances share one state dictionary (`_shared_state`), so constructing the class again hands back the
    same data.

    **Available attributes:**

        - **end_of_dataloader** (`bool`) -- Whether we have reached the end of the current dataloader
        - **remainder** (`int`) -- The number of extra samples that were added from padding the dataloader
        - **sync_gradients** (`bool`) -- Whether the gradients should be synced across all devices
        - **active_dataloader** (`Optional[DataLoader]`) -- The dataloader that is currently being iterated over
        - **dataloader_references** (`List[Optional[DataLoader]]`) -- A list of references to the dataloaders that are
            being iterated over
        - **num_steps** (`int`) -- The number of steps to accumulate over
        - **adjust_scheduler** (`bool`) -- Whether the scheduler should be adjusted to account for the gradient
            accumulation
        - **sync_with_dataloader** (`bool`) -- Whether the gradients should be synced at the end of the dataloader
            iteration and the number of total steps reset
        - **is_xla_gradients_synced** (`bool`) -- Whether the XLA gradients have been synchronized. It is initialized
            as false. Once gradients have been reduced before the optimizer step, this flag is set to true.
            Subsequently, after each step, the flag is reset to false. FSDP will always synchronize the gradients,
            hence is_xla_gradients_synced is always true.
    """

    _shared_state = SharedDict()

    def __init__(self, gradient_accumulation_plugin: GradientAccumulationPlugin | None = None):
        # Every instance aliases the class-level dictionary, which is what makes the state shared.
        self.__dict__ = self._shared_state
        plugin_given = gradient_accumulation_plugin is not None

        if not self.initialized:
            # First construction: populate the defaults.
            self.sync_gradients = True
            self._dataloader_references_ref = [None]
            self.plugin_kwargs = gradient_accumulation_plugin.to_kwargs() if plugin_given else {}
            self._is_xla_gradients_synced = False

        # A later construction may carry different plugin arguments; those replace the stored ones.
        if plugin_given and self.plugin_kwargs != gradient_accumulation_plugin.to_kwargs():
            self.plugin_kwargs = gradient_accumulation_plugin.to_kwargs()

    @property
    def num_steps(self) -> int:
        """Number of steps to accumulate over (`1` when the plugin kwargs do not set it)."""
        configured_steps = self.plugin_kwargs.get("num_steps", 1)
        return configured_steps

    @property
    def adjust_scheduler(self) -> bool:
        """Whether the scheduler should be adjusted (`False` when the plugin kwargs do not set it)."""
        should_adjust = self.plugin_kwargs.get("adjust_scheduler", False)
        return should_adjust

    @property
    def sync_with_dataloader(self) -> bool:
        """
        Whether the gradients should be synced at the end of the dataloader iteration and the number of total
        steps reset (`True` when the plugin kwargs do not set it).
        """
        should_sync = self.plugin_kwargs.get("sync_with_dataloader", True)
        return should_sync

    @property
    def initialized(self) -> bool:
        """Whether the shared `GradientState` dictionary has been populated."""
        shared = GradientState._shared_state
        return shared != {}

    @property
    def end_of_dataloader(self) -> bool:
        """Whether the active dataloader reports its end; `False` when no dataloader is active."""
        return self.active_dataloader.end_of_dataloader if self.in_dataloader else False

    @property
    def remainder(self) -> int:
        """Number of extra samples added from padding the active dataloader; `-1` when none is active."""
        return self.active_dataloader.remainder if self.in_dataloader else -1

    def __repr__(self):
        summary_parts = (
            f"Sync Gradients: {self.sync_gradients}\n",
            f"At end of current dataloader: {self.end_of_dataloader}\n",
            f"Extra samples added: {self.remainder}\n",
            f"Gradient accumulation plugin: {self.plugin_kwargs}\n",
        )
        return "".join(summary_parts)

    @property
    def is_xla_gradients_synced(self):
        """
        Value of the XLA "gradients synced" flag. When the `ACCELERATE_USE_FSDP` environment flag is set the
        answer is always `True`.
        """
        fsdp_enabled = parse_flag_from_env("ACCELERATE_USE_FSDP", default=False)
        return True if fsdp_enabled else self._is_xla_gradients_synced

    @is_xla_gradients_synced.setter
    def is_xla_gradients_synced(self, is_synced):
        """Stores `is_synced` in the private `_is_xla_gradients_synced` attribute."""
        self._is_xla_gradients_synced = is_synced

    def _set_sync_gradients(self, sync_gradients):
        """Private function that sets whether gradients should be synchronized. Users should not have to call this."""
        self.sync_gradients = sync_gradients
        if not self.sync_gradients:
            return
        # Allow grad-sync to automatically work on TPUs
        if is_torch_xla_available(check_is_tpu=True) and PartialState().distributed_type == DistributedType.XLA:
            xm.mark_step()

    def _add_dataloader(self, dataloader):
        """
        Private function that appends `dataloader` to `self.dataloader_references`, which makes it the active
        dataloader. Users should not have to call this.
        """
        # Plain assignment (rather than `.append` on the returned list) is deliberate: only assignment goes
        # through the property setter, which is what wraps the entries in weak references.
        self.dataloader_references = self.dataloader_references + [dataloader]

    def _remove_dataloader(self, dataloader):
        """
        Private function that drops `dataloader` from `self.dataloader_references`. Users should not have to call
        this.
        """
        remaining = [candidate for candidate in self.dataloader_references if candidate != dataloader]
        # Assign through the property so that the setter is triggered.
        self.dataloader_references = remaining

    @property
    def active_dataloader(self):
        """The most recently added dataloader reference (the last entry of `dataloader_references`)."""
        references = self.dataloader_references
        return references[-1]

    @property
    def dataloader_references(self):
        """The tracked dataloaders, dereferenced from their weak references (`None` entries stay `None`)."""
        # Weak references are stored to avoid circular references that prevent garbage collection.
        return [None if stored is None else stored() for stored in self._dataloader_references_ref]

    @dataloader_references.setter
    def dataloader_references(self, references):
        self._dataloader_references_ref = [
            None if candidate is None else weakref.ref(candidate) for candidate in references
        ]

    @property
    def in_dataloader(self) -> bool:
        """Whether the current process is in a dataloader, i.e. the active dataloader is not `None`."""
        current = self.active_dataloader
        return current is not None

    @staticmethod
    def _reset_state():
        """Resets `_shared_state`, is used internally and should not be called"""
        GradientState._shared_state.clear()


================================================
FILE: src/accelerate/test_utils/__init__.py
================================================
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .testing import (
    DEFAULT_LAUNCH_COMMAND,
    are_the_same_tensors,
    assert_exception,
    capture_call_output,
    device_count,
    execute_subprocess_async,
    get_launch_command,
    get_torch_dist_unique_port,
    memory_allocated_func,
    path_in_accelerate_package,
    pytest_xdist_worker_id,
    require_bnb,
    require_cpu,
    require_cuda,
    require_cuda_or_hpu,
    require_cuda_or_xpu,
    require_fp8,
    require_fp16,
    require_huggingface_suite,
    require_mlu,
    require_mps,
    require_multi_device,
    require_multi_gpu,
    require_multi_gpu_or_xpu,
    require_multi_xpu,
    require_musa,
    require_non_cpu,
    require_non_hpu,
    require_non_torch_xla,
    require_non_xpu,
    require_npu,
    require_pippy,
    require_sdaa,
    require_single_device,
    require_single_gpu,
    require_single_xpu,
    require_torch_min_version,
    require_torchao,
    require_torchvision,
    require_tpu,
    require_transformer_engine,
    require_transformer_engine_mxfp8,
    require_xpu,
    run_first,
    skip,
    slow,
    torch_device,
)
from .training import RegressionDataset, RegressionModel


from .scripts import test_script, test_sync, test_ops  # isort: skip


================================================
FILE: src/accelerate/test_utils/examples.py
================================================
#!/usr/bin/env python

# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A collection of utilities for comparing `examples/complete_*_example.py` scripts with the capabilities inside of each
`examples/by_feature` example. `compare_against_test` is the main function that should be used when testing, while the
others are used to either get the code that matters, or to preprocess them (such as stripping comments)
"""

import os
from typing import Optional


def get_function_contents_by_name(lines: list[str], name: str):
    """
    Extracts a function from `lines` of segmented source code with the name `name`.

    Collection starts at the first line containing `def {name}` and stops right before the line that marks the
    next section: `def main` when extracting `training_function`, `if __name__` when extracting `main`. If the
    end of `lines` is reached first, everything collected so far is returned.

    Args:
        lines (`List[str]`):
            Source code of a script separated by line.
        name (`str`):
            The name of the function to extract. Should be either `training_function` or `main`

    Returns:
        `List[str]` or `None`: The lines of the function, starting with its `def` line, or `None` when no line
        containing `def {name}` exists.

    Raises:
        `ValueError`: If `name` is neither `training_function` nor `main`.
    """
    if name not in ("training_function", "main"):
        raise ValueError(f"Incorrect function name passed: {name}, choose either 'main' or 'training_function'")

    start_marker = f"def {name}"
    # Text found on the first line that no longer belongs to the requested function.
    end_marker = "def main" if name == "training_function" else "if __name__"

    good_lines, found_start = [], False
    for line in lines:
        if not found_start:
            if start_marker in line:
                found_start = True
                good_lines.append(line)
            continue
        if end_marker in line:
            return good_lines
        good_lines.append(line)

    # The function ran to the end of the file (no end marker). Return what was collected instead of falling
    # through to an implicit `None`, which callers such as `clean_lines` cannot iterate over.
    return good_lines if found_start else None


def clean_lines(lines: list[str]):
    """
    Returns a new list holding the entries of `lines` that carry code: comment-only lines (first non-blank
    character is '#') and entries that are exactly a newline ('\n') are dropped.

    Args:
        lines (`List[str]`):
            Source code of a script separated by line.
    """
    kept = []
    for entry in lines:
        if entry == "\n" or entry.lstrip().startswith("#"):
            continue
        kept.append(entry)
    return kept


def compare_against_test(
    base_filename: str, feature_filename: str, parser_only: bool, secondary_filename: Optional[str] = None
):
    """
    Tests whether the additional code inside of `feature_filename` was implemented in `base_filename`. This should be
    used when testing to see if `complete_*_.py` examples have all of the implementations from each of the
    `examples/by_feature/*` scripts.

    It utilizes `nlp_example.py` to extract out all of the repeated training code, so that only the new additional code
    is examined and checked. If something *other* than `nlp_example.py` should be used, such as `cv_example.py` for the
    `complete_cv_example.py` script, it should be passed in for the `secondary_filename` parameter.

    The template is opened as `examples/nlp_example.py` relative to the current working directory.

    Args:
        base_filename (`str` or `os.PathLike`):
            The filepath of a single "complete" example script to test, such as `examples/complete_cv_example.py`
        feature_filename (`str` or `os.PathLike`):
            The filepath of a single feature example script. The contents of this script are checked to see if they
            exist in `base_filename`
        parser_only (`bool`):
            Whether to compare only the `main()` sections in both files, or to compare the contents of
            `training_function()`
        secondary_filename (`str`, *optional*):
            A potential secondary filepath that should be included in the check. This function extracts the base
            functionalities off of "examples/nlp_example.py", so if `base_filename` is a script other than
            `complete_nlp_example.py`, the template script should be included here. Such as `examples/cv_example.py`

    Returns:
        `List[str]`: The lines that are new in the feature script (relative to the template) but were not found
        among the new lines of the base script. An empty list means nothing is missing.
    """
    with open(base_filename) as f:
        base_file_contents = f.readlines()
    with open(os.path.abspath(os.path.join("examples", "nlp_example.py"))) as f:
        full_file_contents = f.readlines()
    with open(feature_filename) as f:
        feature_file_contents = f.readlines()
    if secondary_filename is not None:
        with open(secondary_filename) as f:
            secondary_file_contents = f.readlines()

    # This is our base, we remove all the code from here in our `full_filename` and `feature_filename` to find the new content
    # The same function (`main` or `training_function`) is pulled out of every file and stripped of comment-only
    # and blank lines.
    if parser_only:
        base_file_func = clean_lines(get_function_contents_by_name(base_file_contents, "main"))
        full_file_func = clean_lines(get_function_contents_by_name(full_file_contents, "main"))
        feature_file_func = clean_lines(get_function_contents_by_name(feature_file_contents, "main"))
        if secondary_filename is not None:
            secondary_file_func = clean_lines(get_function_contents_by_name(secondary_file_contents, "main"))
    else:
        base_file_func = clean_lines(get_function_contents_by_name(base_file_contents, "training_function"))
        full_file_func = clean_lines(get_function_contents_by_name(full_file_contents, "training_function"))
        feature_file_func = clean_lines(get_function_contents_by_name(feature_file_contents, "training_function"))
        if secondary_filename is not None:
            secondary_file_func = clean_lines(
                get_function_contents_by_name(secondary_file_contents, "training_function")
            )

    # This dataloader line is never counted as new code, whatever its indentation.
    _dl_line = "train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)\n"

    # Specific code in our script that differs from the full version, aka what is new
    new_feature_code = []
    passed_idxs = []  # We keep track of the idxs just in case it's a repeated statement
    it = iter(feature_file_func)
    # NOTE(review): the loop runs `len(feature_file_func) - 1` times and pulls one line from `it` per pass (two
    # when a line is skipped below), so without skips the last line is never examined - confirm this is intended.
    for i in range(len(feature_file_func) - 1):
        # NOTE(review): an index is only appended after it has been visited, so this check is always true.
        if i not in passed_idxs:
            line = next(it)
            if (line not in full_file_func) and (line.lstrip() != _dl_line):
                if "TESTING_MOCKED_DATALOADERS" not in line:
                    new_feature_code.append(line)
                    passed_idxs.append(i)
                else:
                    # Skip over the `config['num_epochs'] = 2` statement
                    _ = next(it)

    # Extract out just the new parts from the full_file_training_func
    new_full_example_parts = []
    passed_idxs = []  # We keep track of the idxs just in case it's a repeated statement
    for i, line in enumerate(base_file_func):
        if i not in passed_idxs:
            if (line not in full_file_func) and (line.lstrip() != _dl_line):
                if "TESTING_MOCKED_DATALOADERS" not in line:
                    new_full_example_parts.append(line)
                    passed_idxs.append(i)

    # Finally, get the overall diff
    diff_from_example = [line for line in new_feature_code if line not in new_full_example_parts]
    if secondary_filename is not None:
        # NOTE(review): this compares the raw `full_file_contents` (not the cleaned `full_file_func`) against the
        # secondary function - confirm that is the intended input.
        diff_from_two = [line for line in full_file_contents if line not in secondary_file_func]
        diff_from_example = [line for line in diff_from_example if line not in diff_from_two]

    return diff_from_example


================================================
FILE: src/accelerate/test_utils/scripts/__init__.py
================================================
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: src/accelerate/test_utils/scripts/external_deps/__init__.py
================================================
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: src/accelerate/test_utils/scripts/external_deps/test_checkpointing.py
================================================
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType
from accelerate.utils.deepspeed import DummyOptim, DummyScheduler


# NOTE(review): not referenced anywhere else in this script.
MAX_GPU_BATCH_SIZE = 16
# Batch size of the validation DataLoader built in `get_dataloaders`.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16, model_name: str = "bert-base-cased"):
    """
    Builds the train and validation `DataLoader`s for the `glue`/`mrpc` dataset.

    Args:
        accelerator (`Accelerator`):
            An `Accelerator` object; its `distributed_type` decides how batches are padded.
        batch_size (`int`, *optional*):
            The batch size for the train DataLoader. The validation DataLoader uses `EVAL_BATCH_SIZE`.
        model_name (`str`, *optional*):
            Name or path handed to `AutoTokenizer.from_pretrained`.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    raw_datasets = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every split, dropping the raw text columns along the way.
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=["idx", "sentence1", "sentence2"],
        load_from_cache_file=False,
    )
    # The models of the transformers library expect the label column to be called 'labels'.
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
        if accelerator.distributed_type == DistributedType.XLA:
            pad_options = {"padding": "max_length", "max_length": 128}
        else:
            pad_options = {"padding": "longest"}
        return tokenizer.pad(examples, return_tensors="pt", **pad_options)

    train_split = tokenized_datasets["train"]
    validation_split = tokenized_datasets["validation"]
    train_dataloader = DataLoader(train_split, shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
    eval_dataloader = DataLoader(
        validation_split, shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
    )

    return train_dataloader, eval_dataloader


def evaluation_loop(accelerator, model, eval_dataloader, metric):
    """
    Runs `model` over `eval_dataloader` without gradients and returns the `"accuracy"` entry computed by `metric`.

    Predictions are the argmax of the logits over the last dimension. Predictions and labels are gathered with
    `accelerator.gather` before they are handed to `metric.add_batch`.
    """
    model.eval()
    seen_so_far = 0
    for step, batch in enumerate(eval_dataloader):
        # We could avoid this line since we set the accelerator with `device_placement=True`.
        batch.to(accelerator.device)
        with torch.no_grad():
            logits = model(**batch).logits
        predictions = logits.argmax(dim=-1)
        # It is slightly faster to call this once, than multiple times
        predictions, references = accelerator.gather((predictions, batch["labels"]))
        if accelerator.use_distributed:
            # If we are in a multiprocess environment, the last batch has duplicates
            is_last_batch = step == len(eval_dataloader) - 1
            if is_last_batch:
                leftover = len(eval_dataloader.dataset) - seen_so_far
                predictions = predictions[:leftover]
                references = references[:leftover]
            else:
                seen_so_far += references.shape[0]
        metric.add_batch(predictions=predictions, references=references)

    return metric.compute()["accuracy"]


def training_function(config, args):
    """
    Trains a sequence classification model and saves a checkpoint after every epoch, or verifies a checkpoint.

    Without `args.resume_from_checkpoint`, every epoch ends with `accelerator.save_state` into
    `<output_dir>/epoch_<n>`, an evaluation pass, and (on the main process) a `state_<n>.json` file holding the
    accuracy, learning rates, epoch and step count.

    With `args.resume_from_checkpoint`, the checkpoint is loaded, the model is evaluated, and the results are
    asserted against the matching `state_<n>.json` file. The function then returns without training.

    Args:
        config (`dict`):
            Hyper-parameters; the keys `lr`, `num_epochs`, `seed` and `batch_size` are read.
        args:
            Parsed command line arguments; `model_name_or_path`, `output_dir`, `resume_from_checkpoint` and
            `partial_train_epoch` are read.
    """
    # Initialize accelerator
    accelerator = Accelerator()

    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])
    model_name = args.model_name_or_path

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size, model_name)

    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True)

    # Instantiate optimizer
    # `DummyOptim` is used only when a DeepSpeed plugin is active and its config has an "optimizer" entry.
    optimizer_cls = (
        AdamW
        if accelerator.state.deepspeed_plugin is None
        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
        else DummyOptim
    )
    optimizer = optimizer_cls(params=model.parameters(), lr=lr)

    # With DeepSpeed the accumulation steps come from the DeepSpeed config, otherwise there is no accumulation.
    if accelerator.state.deepspeed_plugin is not None:
        gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[
            "gradient_accumulation_steps"
        ]
    else:
        gradient_accumulation_steps = 1
    max_training_steps = (len(train_dataloader) * num_epochs) // gradient_accumulation_steps

    # Instantiate scheduler
    # `DummyScheduler` is used only when a DeepSpeed plugin is active and its config has a "scheduler" entry.
    if (
        accelerator.state.deepspeed_plugin is None
        or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
    ):
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=max_training_steps,
        )
    else:
        lr_scheduler = DummyScheduler(optimizer, total_num_steps=max_training_steps, warmup_num_steps=0)

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # We need to keep track of how many total steps we have iterated over
    overall_step = 0
    # We also need to keep track of the starting epoch so files are named properly
    starting_epoch = 0
    metric = evaluate.load("glue", "mrpc")
    ending_epoch = num_epochs

    # Optionally stop early, before `num_epochs` is reached.
    if args.partial_train_epoch is not None:
        ending_epoch = args.partial_train_epoch

    if args.resume_from_checkpoint:
        accelerator.load_state(args.resume_from_checkpoint)
        # The epoch number is the run of digits right after "epoch_" in the checkpoint path.
        epoch_string = args.resume_from_checkpoint.split("epoch_")[1]
        state_epoch_num = ""
        for char in epoch_string:
            if char.isdigit():
                state_epoch_num += char
            else:
                break
        starting_epoch = int(state_epoch_num) + 1
        accuracy = evaluation_loop(accelerator, model, eval_dataloader, metric)
        accelerator.print("resumed checkpoint performance:", accuracy)
        accelerator.print("resumed checkpoint's scheduler's lr:", lr_scheduler.get_lr()[0])
        accelerator.print("resumed optimizers's lr:", optimizer.param_groups[0]["lr"])
        # Compare against the values that were written out when the checkpoint's epoch finished.
        with open(os.path.join(args.output_dir, f"state_{starting_epoch - 1}.json")) as f:
            resumed_state = json.load(f)
            assert resumed_state["accuracy"] == accuracy, "Accuracy mismatch, loading from checkpoint failed"
            assert resumed_state["lr"] == lr_scheduler.get_lr()[0], (
                "Scheduler learning rate mismatch, loading from checkpoint failed"
            )
            assert resumed_state["optimizer_lr"] == optimizer.param_groups[0]["lr"], (
                "Optimizer learning rate mismatch, loading from checkpoint failed"
            )
            assert resumed_state["epoch"] == starting_epoch - 1, "Epoch mismatch, loading from checkpoint failed"
            # Resuming only verifies the checkpoint; no further training happens.
            return

    # Now we train the model
    state = {}
    for epoch in range(starting_epoch, ending_epoch):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / gradient_accumulation_steps
            accelerator.backward(loss)
            # The optimizer steps whenever the step index is a multiple of the accumulation steps (step 0 included).
            if step % gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            overall_step += 1
        # Save a checkpoint for this epoch, then record the values a later resume is checked against.
        output_dir = f"epoch_{epoch}"
        output_dir = os.path.join(args.output_dir, output_dir)
        accelerator.save_state(output_dir)
        accuracy = evaluation_loop(accelerator, model, eval_dataloader, metric)
        state["accuracy"] = accuracy
        state["lr"] = lr_scheduler.get_lr()[0]
        state["optimizer_lr"] = optimizer.param_groups[0]["lr"]
        state["epoch"] = epoch
        state["step"] = overall_step
        accelerator.print(f"epoch {epoch}:", state)

        accelerator.wait_for_everyone()
        # Only the main process writes the JSON summary.
        if accelerator.is_main_process:
            with open(os.path.join(args.output_dir, f"state_{epoch}.json"), "w") as f:
                json.dump(state, f)
    accelerator.end_training()


def main():
    """
    Parses the command line arguments, builds the hyper-parameter config and runs `training_function`.

    The learning rate (2e-5), seed (42) and batch size (16) are fixed; only the number of epochs comes from the
    command line.
    """
    # The description used to talk about "tracking peak GPU memory usage", which this script does not do:
    # it saves a checkpoint every epoch and can verify a saved checkpoint.
    parser = argparse.ArgumentParser(
        description="Simple example of a training script that saves checkpoints and verifies resuming from them."
    )
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        default="bert-base-cased",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=False,
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=".",
        help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
    )
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help="If the training should continue from a checkpoint folder.",
    )
    parser.add_argument(
        "--partial_train_epoch",
        type=int,
        default=None,
        help="If passed, the training will stop after this number of epochs.",
    )
    parser.add_argument(
        "--num_epochs",
        type=int,
        default=2,
        help="Number of train epochs.",
    )
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": args.num_epochs, "seed": 42, "batch_size": 16}

    training_function(config, args)


# Script entry point: run `main()` only when the file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/test_utils/scripts/external_deps/test_ds_alst_ulysses_sp.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Test script for verifying ALST/Ulysses SP works
"""

import torch
from deepspeed.runtime.utils import move_to_device
from transformers import AutoModelForCausalLM, AutoTokenizer

from accelerate import Accelerator
from accelerate.utils import ParallelismConfig, set_seed
from accelerate.utils.dataclasses import DeepSpeedSequenceParallelConfig


# Fix the random seed before anything else is created.
set_seed(42)

# Used below as the sequence-parallel size (`sp_size`).
world_size = 2
model_name = "hf-internal-testing/tiny-random-LlamaForCausalLM"

micro_batch_size = 1

# Sequence parallelism through the "deepspeed" backend.
parallelism_config = ParallelismConfig(
    sp_backend="deepspeed",
    sp_size=world_size,
    # dp_shard_size=1, # set if dp is wanted as well
    sp_handler=DeepSpeedSequenceParallelConfig(
        sp_seq_length=256,
        sp_seq_length_is_variable=True,
        sp_attn_implementation="sdpa",
    ),
)

accelerator = Accelerator(
    parallelism_config=parallelism_config,
)

# NOTE(review): `tokenizer` is not used anywhere else in this script.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

samples = 4
seqlen = 32
# Synthetic data of shape (samples, seqlen): consecutive integers starting at 1, shifted up by 100.
input_ids = torch.arange(1, seqlen * samples + 1).view(-1, seqlen) + 100
# Positions are consecutive across the whole tensor, i.e. they do not restart at 0 for each row.
position_ids = torch.arange(seqlen * samples).view(-1, seqlen)

ds = torch.utils.data.TensorDataset(input_ids, position_ids)


def collate_fn(batch):
    """
    Turns the first sample of `batch` into a model input dict with a leading batch dimension of 1.

    Only `batch[0]` is read. The labels are the input ids themselves.
    """
    sample_ids, sample_positions = batch[0]
    return {
        "input_ids": sample_ids.unsqueeze(0),
        "position_ids": sample_positions.unsqueeze(0),
        "labels": sample_ids.unsqueeze(0),
    }


# One sample per step; `collate_fn` adds the batch dimension.
dl = torch.utils.data.DataLoader(ds, batch_size=micro_batch_size, collate_fn=collate_fn)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

rank = torch.distributed.get_rank()

if rank == 0:
    print(f"DL orig: {len(dl)} samples")

model, optimizer, dl = accelerator.prepare(model, optimizer, dl)

if rank == 0:
    print(f"DL w/ adapter: {len(dl)} samples")

sp_size = parallelism_config.sp_size if parallelism_config else 1
if sp_size > 1:
    from deepspeed.utils import groups

    sp_group = groups._get_sequence_parallel_group()
    sp_world_size = parallelism_config.sp_size

unwrapped_model = accelerator.unwrap_model(model)

# Normal training loop
# The loop index is called `step` (not `iter`) so that the `iter` builtin is not shadowed.
for step, batch in enumerate(dl):
    optimizer.zero_grad()

    if rank == 0:
        print(f"batch {step}: seqlen: {len(batch['input_ids'][0])}")
    batch = move_to_device(batch, model.device)
    outputs = model(**batch)

    # NOTE(review): `collate_fn` does not create "shift_labels"; presumably the prepared dataloader adds it -
    # confirm.
    shift_labels = batch["shift_labels"]
    loss = unwrapped_model.loss_function(
        logits=outputs.logits,
        labels=None,
        shift_labels=shift_labels,
        vocab_size=unwrapped_model.config.vocab_size,
    )

    if sp_size > 1:
        # differentiable weighted per-shard-loss aggregation across ranks
        losses_per_rank = torch.distributed.nn.functional.all_gather(loss, group=sp_group)
        # special dealing with SFT that has prompt tokens that aren't used in loss computation
        good_tokens = (shift_labels != -100).view(-1).sum()
        good_tokens_per_rank = torch.distributed.nn.functional.all_gather(good_tokens, group=sp_group)
        # `sp_rank` indexes the gathered per-rank values; it is kept distinct from this process's own `rank`.
        total_loss = sum(
            losses_per_rank[sp_rank] * good_tokens_per_rank[sp_rank]
            for sp_rank in range(sp_world_size)
            if good_tokens_per_rank[sp_rank] > 0
        )
        total_good_tokens = sum(good_tokens_per_rank)
        # `max(..., 1)` avoids dividing by zero when no rank has any token that counts towards the loss.
        loss = total_loss / max(total_good_tokens, 1)

    if rank == 0:
        accelerator.print(f"{step}: {loss=}")
    accelerator.log(dict(train_loss=loss, step=step))

    accelerator.backward(loss)
    optimizer.step()

accelerator.end_training()


================================================
FILE: src/accelerate/test_utils/scripts/external_deps/test_ds_multiple_model.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Test script for verifying multiple models can be utilized with Accelerate + DeepSpeed:

Scenario 1: One model is training, another model is being used for inference/logits to impact training in some form.
Scenario 2: Two models are training simultaneously, which means two optimizers, etc.
"""

import argparse
from pathlib import Path

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup

from accelerate import Accelerator, DeepSpeedPlugin, DistributedType
from accelerate.state import AcceleratorState
from accelerate.utils.deepspeed import get_active_deepspeed_plugin


EVAL_BATCH_SIZE = 16


class NoiseModel(torch.nn.Module):
    """
    Tiny module that owns a single scalar parameter, `noise_factor`, and scales its input by it.

    Args:
        noise_factor (`float`, *optional*, defaults to 0.1):
            Initial value of the scalar parameter (stored as `torch.float32`).
    """

    def __init__(self, noise_factor=0.1):
        super().__init__()
        initial_value = torch.tensor(noise_factor, dtype=torch.float32)
        self.noise_factor = torch.nn.Parameter(initial_value)

    def forward(self, loss):
        """Return `loss` multiplied by the `noise_factor` parameter."""
        scaled_loss = loss * self.noise_factor
        return scaled_loss


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16, model_name: str = "bert-base-cased"):
    """
    Builds the train and validation `DataLoader`s for the `mrpc` configuration of the `glue` dataset.

    Args:
        accelerator (`Accelerator`):
            An `Accelerator` object; its `distributed_type` decides which padding strategy the collate function uses.
        batch_size (`int`, *optional*):
            The batch size for the train DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.
        model_name (`str`, *optional*):
            Name or path handed to `AutoTokenizer.from_pretrained` to load the tokenizer.

    Returns:
        `tuple`: `(train_dataloader, eval_dataloader)`.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    raw_splits = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => fall back to the model max length (which is the default anyway)
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every split, dropping the raw text columns, then rename 'label' to 'labels' since that is the
    # name the models of the transformers library expect.
    encoded_splits = raw_splits.map(
        tokenize_function,
        batched=True,
        remove_columns=["idx", "sentence1", "sentence2"],
        load_from_cache_file=False,
    ).rename_column("label", "labels")

    def collate_fn(examples):
        running_on_xla = accelerator.distributed_type == DistributedType.XLA
        if not running_on_xla:
            return tokenizer.pad(examples, padding="longest", return_tensors="pt")
        # On TPU it's best to pad everything to the same length or training will be very slow.
        return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")

    # Only the training data is shuffled.
    train_dataloader = DataLoader(
        encoded_splits["train"], batch_size=batch_size, collate_fn=collate_fn, shuffle=True
    )
    eval_dataloader = DataLoader(
        encoded_splits["validation"], batch_size=EVAL_BATCH_SIZE, collate_fn=collate_fn, shuffle=False
    )

    return train_dataloader, eval_dataloader


# Walk six directory levels up from this script to get the directory that contains `tests/`.
test_file_path = __file__
path = Path(test_file_path).resolve()
test_file_dir_str = str(path.parent.parent.parent.parent.parent.parent)

# DeepSpeed config files used to build the plugins.
# The scripts bring their own schedulers and optimizers, hence the `model_only` configs.
ds_config_file = {
    "zero2": f"{test_file_dir_str}/tests/deepspeed/ds_config_zero2_model_only.json",
    "zero3": f"{test_file_dir_str}/tests/deepspeed/ds_config_zero3_model_only.json",
}


def single_model_training(config, args):
    """
    Scenario 1: train one model under the "training" plugin (built from the zero2 config) while a second model,
    prepared under the "inference" plugin (built from the zero3 config), adds a gradient-free term to the loss.

    Args:
        config (`dict`):
            Hyper-parameters; the keys `"num_epochs"`, `"batch_size"` and `"lr"` are read here.
        args:
            Parsed command line arguments; only `args.model_name_or_path` is read here.

    Raises:
        `AssertionError`: If the expected DeepSpeed plugin is not the active one at either checkpoint below, or if the
        best accuracy over all epochs is not strictly greater than the accuracy measured after epoch 0.
    """
    # Training a single model, we have a `noise` model that is untrainable used to inject some noise into the training process
    num_epochs = config["num_epochs"]
    zero2_plugin = DeepSpeedPlugin(hf_ds_config=ds_config_file["zero2"])
    zero3_plugin = DeepSpeedPlugin(hf_ds_config=ds_config_file["zero3"])

    # The dictionary keys are the names later passed to `select_deepspeed_plugin`
    deepspeed_plugins = {"training": zero2_plugin, "inference": zero3_plugin}

    # Initialize accelerator
    accelerator = Accelerator(
        deepspeed_plugins=deepspeed_plugins,
        mixed_precision="bf16",
    )

    # Initialize model under zero2 plugin
    # (the assert checks that the "training" plugin is the active one right after construction)
    assert get_active_deepspeed_plugin(accelerator.state) is zero2_plugin
    train_model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
    train_dataloader, eval_dataloader = get_dataloaders(
        accelerator, batch_size=config["batch_size"], model_name=args.model_name_or_path
    )
    # The scheduler length is computed from the un-prepared dataloader
    max_training_steps = len(train_dataloader) * config["num_epochs"]
    optimizer = AdamW(train_model.parameters(), lr=config["lr"])
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=max_training_steps
    )

    train_dataloader, eval_dataloader, train_model, optimizer, lr_scheduler = accelerator.prepare(
        train_dataloader, eval_dataloader, train_model, optimizer, lr_scheduler
    )

    # Now prepare the model under zero3 plugin
    accelerator.state.select_deepspeed_plugin("inference")
    assert get_active_deepspeed_plugin(accelerator.state) is zero3_plugin
    inference_model = NoiseModel()
    inference_model = accelerator.prepare(inference_model)
    inference_model.eval()

    # Run training loop
    # Switch back to the "training" plugin before running any training step
    accelerator.state.select_deepspeed_plugin("training")
    # We also need to keep track of the starting epoch so files are named properly
    starting_epoch = 0

    # Now we train the model
    best_performance = 0
    metric = evaluate.load("glue", "mrpc")
    performance_metric = {}
    for epoch in range(starting_epoch, num_epochs):
        train_model.train()
        # NOTE(review): this puts the inference model back into train mode although it was set to eval mode above;
        # `NoiseModel` only holds a single parameter, so this is presumably harmless -- confirm the intent.
        inference_model.train()
        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(train_model):
                outputs_1 = train_model(**batch)
                # The noise term is computed without gradient tracking, so it changes the loss value only
                with torch.no_grad():
                    outputs_2 = inference_model(outputs_1.loss)
                # Combine the losses
                loss = outputs_1.loss + outputs_2
                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

        train_model.eval()
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = train_model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            # It is slightly faster to call this once, than multiple times
            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)
        performance_metric[f"epoch-{epoch}"] = eval_metric["accuracy"]

        # Track the best accuracy seen so far
        if best_performance < eval_metric["accuracy"]:
            best_performance = eval_metric["accuracy"]
    # The comparison is strict: a later epoch has to beat the accuracy of epoch 0, so this fails when only one epoch
    # is run (and raises `KeyError` when no epoch is run at all).
    assert best_performance > performance_metric["epoch-0"]


def multiple_model_training(config, args):
    """
    Scenario 2: train two models side by side on the same batches, one prepared under the "zero2" plugin and one
    under the "zero3" plugin, each with its own optimizer and learning rate scheduler.

    Args:
        config (`dict`):
            Hyper-parameters; the keys `"num_epochs"`, `"batch_size"` and `"lr"` are read here.
        args:
            Parsed command line arguments; only `args.model_name_or_path` is read here.

    Raises:
        `AssertionError`: If the expected DeepSpeed plugin is not active, if the engine held by an accelerator is not
        the model it just prepared, or if either model's best accuracy is not strictly greater than its accuracy
        after epoch 0.
    """
    # This will essentially be like a k-fold model, but one model is Zero-2 and another model is Zero-3
    num_epochs = config["num_epochs"]
    zero2_plugin = DeepSpeedPlugin(hf_ds_config=ds_config_file["zero2"])
    zero3_plugin = DeepSpeedPlugin(hf_ds_config=ds_config_file["zero3"])

    # The dictionary keys are the names later passed to `select_deepspeed_plugin`
    deepspeed_plugins = {"zero2": zero2_plugin, "zero3": zero3_plugin}

    # Initialize accelerator
    zero2_accelerator = Accelerator(
        deepspeed_plugins=deepspeed_plugins,
        mixed_precision="bf16",
    )

    # Since an `AcceleratorState` has already been made, we can just reuse it here
    zero3_accelerator = Accelerator()

    # Initialize model under zero2 plugin
    assert get_active_deepspeed_plugin(zero2_accelerator.state) is zero2_plugin
    zero2_model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
    train_dataloader, eval_dataloader = get_dataloaders(
        zero2_accelerator, batch_size=config["batch_size"], model_name=args.model_name_or_path
    )
    # The scheduler length is computed from the un-prepared dataloader
    max_training_steps = len(train_dataloader) * config["num_epochs"]
    zero2_optimizer = AdamW(zero2_model.parameters(), lr=config["lr"])
    zero2_lr_scheduler = get_linear_schedule_with_warmup(
        zero2_optimizer, num_warmup_steps=0, num_training_steps=max_training_steps
    )

    # The dataloaders are prepared once, with the zero2 accelerator, and shared by both models
    train_dataloader, eval_dataloader, zero2_model, zero2_optimizer, zero2_lr_scheduler = zero2_accelerator.prepare(
        train_dataloader, eval_dataloader, zero2_model, zero2_optimizer, zero2_lr_scheduler
    )
    assert zero2_accelerator.deepspeed_engine_wrapped.engine is zero2_model

    # now do Zero3
    zero3_accelerator.state.select_deepspeed_plugin("zero3")
    # No dataloader is passed to the zero3 `prepare` call below, so the micro batch size is copied over from the
    # zero2 plugin's config beforehand
    zero3_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = zero2_plugin.deepspeed_config[
        "train_micro_batch_size_per_gpu"
    ]
    assert get_active_deepspeed_plugin(zero3_accelerator.state) is zero3_plugin
    zero3_model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
    zero3_optimizer = AdamW(zero3_model.parameters(), lr=config["lr"])
    zero3_lr_scheduler = get_linear_schedule_with_warmup(
        zero3_optimizer, num_warmup_steps=0, num_training_steps=max_training_steps
    )
    zero3_model, zero3_optimizer, zero3_lr_scheduler = zero3_accelerator.prepare(
        zero3_model, zero3_optimizer, zero3_lr_scheduler
    )
    assert zero3_accelerator.deepspeed_engine_wrapped.engine is zero3_model

    # Run training loop
    starting_epoch = 0

    # Now we train the model
    # (the `_a` names track the zero2 model, the `_b` names track the zero3 model)
    best_performance_a = 0
    best_performance_b = 0
    metric_a = evaluate.load("glue", "mrpc")
    metric_b = evaluate.load("glue", "mrpc")
    performance_metric_a = {}
    performance_metric_b = {}
    for epoch in range(starting_epoch, num_epochs):
        zero2_model.train()
        zero3_model.train()
        for step, batch in enumerate(train_dataloader):
            with zero2_accelerator.accumulate(zero2_model, zero3_model):
                # Full optimization step for the zero2 model on this batch ...
                outputs_1 = zero2_model(**batch)
                zero2_accelerator.backward(outputs_1.loss)
                zero2_optimizer.step()
                zero2_lr_scheduler.step()
                zero2_optimizer.zero_grad()
                # ... followed by a full optimization step for the zero3 model on the same batch
                outputs_2 = zero3_model(**batch)
                zero3_accelerator.backward(outputs_2.loss)
                zero3_optimizer.step()
                zero3_lr_scheduler.step()
                zero3_optimizer.zero_grad()

        zero2_model.eval()
        zero3_model.eval()
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                logits_a = zero2_model(**batch).logits
                logits_b = zero3_model(**batch).logits
            # Turn each model's logits into class predictions (the two models are scored independently)
            predictions_a = logits_a.argmax(dim=-1)
            predictions_b = logits_b.argmax(dim=-1)
            # It is slightly faster to call this once, than multiple times
            predictions_a, predictions_b, references = zero2_accelerator.gather_for_metrics(
                (predictions_a, predictions_b, batch["labels"])
            )
            metric_a.add_batch(
                predictions=predictions_a,
                references=references,
            )
            metric_b.add_batch(
                predictions=predictions_b,
                references=references,
            )

        eval_metric_a = metric_a.compute()
        eval_metric_b = metric_b.compute()
        # Use accelerator.print to print only on the main process.
        zero2_accelerator.print(f"epoch {epoch}:", eval_metric_a, eval_metric_b)
        performance_metric_a[f"epoch-{epoch}"] = eval_metric_a["accuracy"]
        performance_metric_b[f"epoch-{epoch}"] = eval_metric_b["accuracy"]

        # Track the best accuracy seen so far for each model
        if best_performance_a < eval_metric_a["accuracy"]:
            best_performance_a = eval_metric_a["accuracy"]
        if best_performance_b < eval_metric_b["accuracy"]:
            best_performance_b = eval_metric_b["accuracy"]
    # Both comparisons are strict: each model has to beat its own epoch 0 accuracy in a later epoch
    assert best_performance_a > performance_metric_a["epoch-0"]
    assert best_performance_b > performance_metric_b["epoch-0"]


def main():
    """
    Parse the command line, then run `single_model_training` followed by `multiple_model_training` with the same
    configuration, resetting the `AcceleratorState` between the two.

    `--performance_lower_bound` is accepted on the command line but is not read in this function.
    """
    # NOTE(review): the description text mentions peak GPU memory, which this script does not track -- it looks
    # like it was carried over from another script.
    parser = argparse.ArgumentParser(description="Simple example of training script tracking peak GPU memory usage.")
    cli_options = (
        (
            "--model_name_or_path",
            {
                "type": str,
                "default": "bert-base-cased",
                "help": "Path to pretrained model or model identifier from huggingface.co/models.",
                "required": False,
            },
        ),
        (
            "--performance_lower_bound",
            {
                "type": float,
                "default": None,
                "help": "Optional lower bound for the performance metric. If set, the training will throw error when the performance metric drops below this value.",
            },
        ),
        (
            "--num_epochs",
            {
                "type": int,
                "default": 3,
                "help": "Number of train epochs.",
            },
        ),
    )
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    config = dict(lr=2e-5, num_epochs=args.num_epochs, seed=42, batch_size=8)

    single_model_training(config, args)
    # Reset the accelerator state before the second scenario builds its own accelerators
    AcceleratorState._reset_state(True)
    multiple_model_training(config, args)


# Run both scenarios only when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/test_utils/scripts/external_deps/test_metrics.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import math
import os
from copy import deepcopy

import datasets
import evaluate
import torch
import transformers
from datasets import load_dataset
from torch.utils.data import DataLoader, IterableDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from accelerate import Accelerator, DataLoaderConfiguration, DistributedType
from accelerate.data_loader import DataLoaderDispatcher
from accelerate.test_utils import RegressionDataset, RegressionModel, torch_device
from accelerate.utils import is_torch_xla_available, set_seed


os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"


class ListHandler(logging.Handler):
    """Logging handler that keeps every record it is given in the in-memory list `self.logs`."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Records are stored in the order they are emitted
        self.logs = list()

    def emit(self, record):
        """Store `record` at the end of `self.logs`."""
        self.logs.extend((record,))


def get_basic_setup(accelerator, num_samples=82, batch_size=16):
    """
    Returns everything needed to perform basic training.

    Seeds with 42, then creates a `RegressionModel`, a deep copy of it, and a `DataLoader` over a
    `RegressionDataset` of `num_samples` items. The original model is only moved to `accelerator.device`, while the
    copy and the dataloader go through `accelerator.prepare`.

    Returns:
        `tuple`: `(model, ddp_model, dataloader)`.
    """
    set_seed(42)
    reference_model = RegressionModel()
    prepared_model = deepcopy(reference_model)
    loader = DataLoader(RegressionDataset(length=num_samples), batch_size=batch_size)
    reference_model.to(accelerator.device)
    prepared_model, loader = accelerator.prepare(prepared_model, loader)
    return reference_model, prepared_model, loader


def get_dataloader(accelerator: Accelerator, use_longest=False):
    """
    Builds an unshuffled `DataLoader` with batch size 16 over the tokenized validation split of `glue`/`mrpc`.

    Args:
        accelerator (`Accelerator`):
            Used for its `main_process_first` context manager around the tokenization step.
        use_longest (`bool`, *optional*, defaults to `False`):
            If truthy, pad each batch to its longest sequence; otherwise pad every batch to a length of 128.
    """
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/mrpc-bert-base-cased")
    validation_split = load_dataset("glue", "mrpc", split="validation")

    def tokenize_function(examples):
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    with accelerator.main_process_first():
        encoded = validation_split.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    encoded = encoded.rename_column("label", "labels")

    def collate_fn(examples):
        if not use_longest:
            return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
        return tokenizer.pad(examples, padding="longest", return_tensors="pt")

    return DataLoader(encoded, batch_size=16, collate_fn=collate_fn, shuffle=False)


def get_mrpc_setup(dispatch_batches, split_batches):
    """
    Creates an `Accelerator` configured with the given dataloader options, plus an MRPC model and dataloader in both
    raw and prepared form.

    Returns:
        `tuple`: `(setup, accelerator)` where `setup["ddp"]` is `[prepared model, prepared dataloader, torch_device]`
        and `setup["no"]` is `[raw model, raw dataloader, accelerator.device]`.
    """
    accelerator = Accelerator(
        dataloader_config=DataLoaderConfiguration(dispatch_batches=dispatch_batches, split_batches=split_batches)
    )
    # Batches are padded to the longest sequence only when they are not dispatched
    raw_dataloader = get_dataloader(accelerator, not dispatch_batches)
    raw_model = AutoModelForSequenceClassification.from_pretrained(
        "hf-internal-testing/mrpc-bert-base-cased", return_dict=True
    )
    prepared_model, prepared_dataloader = accelerator.prepare(raw_model, raw_dataloader)
    setup = {
        "ddp": [prepared_model, prepared_dataloader, torch_device],
        "no": [raw_model, raw_dataloader, accelerator.device],
    }
    return setup, accelerator


def generate_predictions(model, dataloader, accelerator):
    """
    Runs `model` over every batch of `dataloader` without gradient tracking and gathers the results.

    Each batch must be a mapping with exactly two values, taken in order as the model input and the target. The
    model output and the target are passed together through `accelerator.gather_for_metrics`.

    Returns:
        `tuple`: `(logits, targs)`, the gathered outputs and targets of all batches concatenated with `torch.cat`.
    """
    gathered_logits, gathered_targets = [], []
    for batch in dataloader:
        inputs, targets = batch.values()
        with torch.no_grad():
            outputs = model(inputs)
            outputs, targets = accelerator.gather_for_metrics((outputs, targets))
        gathered_logits.append(outputs)
        gathered_targets.append(targets)
    return torch.cat(gathered_logits), torch.cat(gathered_targets)


def test_torch_metrics(
    accelerator: Accelerator, num_samples=82, dispatch_batches=False, split_batches=False, batch_size=16
):
    """
    Checks that gathering predictions over a regression dataset of `num_samples` items yields exactly `num_samples`
    results. `dispatch_batches` and `split_batches` are accepted but not read here.
    """
    setup = get_basic_setup(accelerator, num_samples, batch_size)
    prepared_model, prepared_dataloader = setup[1], setup[2]
    predictions = generate_predictions(prepared_model, prepared_dataloader, accelerator)
    logits = predictions[0]
    assert len(logits) == num_samples, (
        f"Unexpected number of inputs:\n    Expected: {num_samples}\n    Actual: {len(logits)}"
    )


def test_mrpc(dispatch_batches: bool = False, split_batches: bool = False):
    """
    Checks that the MRPC accuracy and f1 computed through the prepared model and dataloader (with
    `gather_for_metrics`) match the values computed with the raw model and dataloader.
    """
    metric = evaluate.load("glue", "mrpc")
    setup, accelerator = get_mrpc_setup(dispatch_batches, split_batches)

    # Baseline: raw model and raw dataloader, batches moved to the device by hand
    plain_model, plain_loader, plain_device = setup["no"]
    plain_model.to(plain_device)
    plain_model.eval()
    for batch in plain_loader:
        batch.to(plain_device)
        with torch.inference_mode():
            plain_outputs = plain_model(**batch)
        metric.add_batch(predictions=plain_outputs.logits.argmax(dim=-1), references=batch["labels"])
    baseline = metric.compute()

    # Distributed: prepared model and dataloader, results gathered before being added to the metric
    prepared_model, prepared_loader, _ = setup["ddp"]
    prepared_model.eval()
    for batch in prepared_loader:
        with torch.inference_mode():
            prepared_outputs = prepared_model(**batch)
        gathered_preds, gathered_refs = accelerator.gather_for_metrics(
            (prepared_outputs.logits.argmax(dim=-1), batch["labels"])
        )
        metric.add_batch(predictions=gathered_preds, references=gathered_refs)
    distributed = metric.compute()

    for key in ("accuracy", "f1"):
        assert math.isclose(baseline[key], distributed[key]), (
            f"Baseline and Distributed are not the same for key {key}:\n\tBaseline: {baseline[key]}\n\tDistributed: {distributed[key]}\n"
        )


def test_gather_for_metrics_with_non_tensor_objects_iterable_dataset():
    """
    Iterates a prepared dataloader over an iterable dataset of 30 plain Python ints and checks that
    `gather_for_metrics` returns all 30 items, and that on the main process nothing was logged on the
    `accelerate.accelerator` logger in the meantime.
    """

    class DummyIterableDataset(IterableDataset):
        def __init__(self, data):
            self.data = data

        def __len__(self):
            return len(self.data)

        def __iter__(self):
            yield from self.data

    accelerator = Accelerator()
    prepared_dataloader = accelerator.prepare(
        DataLoader(DummyIterableDataset(list(range(30))), batch_size=4)
    )

    # Capture every record emitted on the accelerator logger (main process only)
    if accelerator.is_main_process:
        logger = logging.root.manager.loggerDict["accelerate.accelerator"]
        list_handler = ListHandler()
        logger.addHandler(list_handler)

    batches_for_metrics = [accelerator.gather_for_metrics(batch) for batch in prepared_dataloader]

    total_items = torch.cat(batches_for_metrics).size(0)
    assert total_items == 30

    if accelerator.is_main_process:
        assert len(list_handler.logs) == 0
        logger.removeHandler(list_handler)


def test_gather_for_metrics_with_iterable_dataset():
    """
    Iterates a prepared dataloader over an iterable dataset wrapping a 30-element tensor and checks that the prepared
    dataloader is a `DataLoaderDispatcher`, that `gather_for_metrics` returns all 30 items, and that on the main
    process nothing was logged on the `accelerate.accelerator` logger in the meantime.
    """

    class DummyIterableDataset(IterableDataset):
        def __init__(self, data):
            self.data = data

        def __len__(self):
            return len(self.data)

        def __iter__(self):
            yield from self.data

    tensor_dataset = DummyIterableDataset(torch.as_tensor(range(30)))
    raw_dataloader = DataLoader(tensor_dataset, batch_size=4)

    accelerator = Accelerator()
    prepared_dataloader = accelerator.prepare(raw_dataloader)

    assert isinstance(prepared_dataloader, DataLoaderDispatcher)

    # Capture every record emitted on the accelerator logger (main process only)
    if accelerator.is_main_process:
        logger = logging.root.manager.loggerDict["accelerate.accelerator"]
        list_handler = ListHandler()
        logger.addHandler(list_handler)

    batches_for_metrics = [accelerator.gather_for_metrics(batch) for batch in prepared_dataloader]

    total_items = torch.cat(batches_for_metrics).size(0)
    assert total_items == 30

    if accelerator.is_main_process:
        assert len(list_handler.logs) == 0

        logger.removeHandler(list_handler)


def test_gather_for_metrics_drop_last():
    """
    Checks that with `drop_last=True` the second gathered batch holds a complete batch from every process.

    The dataset has one item more than two full batches per process.
    """
    accelerator = Accelerator()
    per_device_batch_size = 5
    num_items = (10 * accelerator.num_processes) + 1
    prepared_dataloader = accelerator.prepare(
        DataLoader(range(num_items), batch_size=per_device_batch_size, drop_last=True)
    )

    batches = iter(prepared_dataloader)
    next(batches)  # Skip first batch tensor([0, 1, 2, 3, 4], device='cuda:0')
    second_batch = next(batches)
    gathered_items = accelerator.gather_for_metrics(second_batch)

    # Should return a full set of complete batches from each GPU
    num_expected_items = per_device_batch_size * accelerator.num_processes
    assert gathered_items.size(0) == (num_expected_items), (
        f"Expected number of items: {num_expected_items}, Actual: {gathered_items.size(0)}"
    )


def main():
    """
    Runs every `gather_for_metrics` check of this script in sequence.

    Several sections build a fresh `Accelerator` for each dataloader configuration and reset the accelerator state
    afterwards, so the order of the statements below matters.
    """
    dataloader_config = DataLoaderConfiguration(split_batches=False, dispatch_batches=False)
    accelerator = Accelerator(dataloader_config=dataloader_config)
    # Keep warnings on the local main process only; the other processes report errors only
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_warning()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
    # TorchXLA does not support batch dispatching. 'put_on_device' is always False for
    # TorchXLA, which can cause a value error in 'prepare_data_loader' function.
    dispatch_batches_options = [False] if accelerator.state.distributed_type == DistributedType.XLA else [True, False]

    # Temporarily close this test for TorchXLA due to the 'Cannot set version_counter for
    # inference tensor' error in inference mode. Reopen it after TorchXLA fixes this bug.
    # These are a bit slower so they should only be ran on the GPU or TPU
    if accelerator.device.type != "cpu" and not is_torch_xla_available():
        if accelerator.is_local_main_process:
            print("**Testing gather_for_metrics**")
        for split_batches in [True, False]:
            for dispatch_batches in dispatch_batches_options:
                if accelerator.is_local_main_process:
                    print(f"With: `split_batches={split_batches}`, `dispatch_batches={dispatch_batches}`")
                # `test_mrpc` builds its own accelerator, so the state is reset after every configuration
                test_mrpc(dispatch_batches, split_batches)
                accelerator.state._reset_state()
        # NOTE(review): unlike the prints above, these two are not guarded by `is_local_main_process`, so every
        # process prints them.
        print("test_gather_for_metrics_with_iterable_dataset")
        test_gather_for_metrics_with_iterable_dataset()
        print("test gather_for_metrics_with_non_tensor_objects_iterable_dataset")
        test_gather_for_metrics_with_non_tensor_objects_iterable_dataset()

    # MpDeviceLoader in TorchXLA is an asynchronous loader that preloads several batches into cache.
    # This can cause the 'end_of_dataloader' of DataLoaderStateMixin to be set earlier than intended.
    # Skip this test when TorchXLA is enabled.
    if accelerator.state.distributed_type != DistributedType.XLA:
        if accelerator.is_local_main_process:
            print("**Test torch metrics**")
        for split_batches in [True, False]:
            for dispatch_batches in dispatch_batches_options:
                dataloader_config = DataLoaderConfiguration(
                    split_batches=split_batches, dispatch_batches=dispatch_batches
                )
                # A new accelerator is created for every dataloader configuration
                accelerator = Accelerator(dataloader_config=dataloader_config)
                if accelerator.is_local_main_process:
                    print(f"With: `split_batches={split_batches}`, `dispatch_batches={dispatch_batches}`, length=99")
                test_torch_metrics(accelerator, 99)
                accelerator.state._reset_state()
    if accelerator.is_local_main_process:
        print("**Test last batch is not dropped when perfectly divisible**")
    accelerator = Accelerator()
    test_torch_metrics(accelerator, 512)
    accelerator.state._reset_state()
    if accelerator.is_local_main_process:
        print("**Test that `drop_last` is taken into account**")
    # `test_gather_for_metrics_drop_last` creates its own `Accelerator`
    test_gather_for_metrics_drop_last()
    accelerator.end_training()
    accelerator.state._reset_state()


def _mp_fn(index):
    """Entry point for xla_spawn (TPUs); `index` is accepted but not used."""
    # For xla_spawn (TPUs)
    main()


# Run all checks only when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/test_utils/scripts/external_deps/test_peak_memory_usage.py
================================================
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import gc
import json
import os

import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType
from accelerate.utils import (
    is_hpu_available,
    is_mlu_available,
    is_musa_available,
    is_neuron_available,
    is_npu_available,
    is_sdaa_available,
    is_xpu_available,
)
from accelerate.utils.deepspeed import DummyOptim, DummyScheduler


MAX_GPU_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 32


def b2mb(x):
    """Convert a number of bytes to whole megabytes (2**20 bytes each), truncating toward zero."""
    bytes_per_megabyte = 2**20
    megabytes = x / bytes_per_megabyte
    return int(megabytes)


# This context manager is used to track the peak memory usage of the process
class TorchTracemalloc:
    """
    Context manager measuring device memory allocated while its body runs.

    The first available backend is used, checked in this order: CUDA, MLU, SDAA, MUSA, NPU, XPU, HPU, Neuron.

    After the `with` block exits, the following attributes are set:
        - `begin`: bytes allocated when the block was entered
        - `end`: bytes allocated when the block was exited
        - `peak`: maximum bytes allocated inside the block (the peak gauge is reset on entry)
        - `used`: `end - begin`, in megabytes
        - `peaked`: `peak - begin`, in megabytes

    NOTE: if none of the backends is available, `begin`, `end` and `peak` are never assigned and `__exit__` raises
    an `AttributeError`.
    """

    def __enter__(self):
        """Collect garbage, reset the backend's peak gauge and record the currently allocated memory."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.cuda.memory_allocated()
        elif is_mlu_available():
            torch.mlu.empty_cache()
            torch.mlu.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.mlu.memory_allocated()
        elif is_sdaa_available():
            torch.sdaa.empty_cache()
            torch.sdaa.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.sdaa.memory_allocated()
        elif is_musa_available():
            torch.musa.empty_cache()
            torch.musa.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.musa.memory_allocated()
        elif is_npu_available():
            torch.npu.empty_cache()
            torch.npu.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.npu.memory_allocated()
        elif is_xpu_available():
            torch.xpu.empty_cache()
            torch.xpu.reset_peak_memory_stats()  # reset the peak gauge to zero
            self.begin = torch.xpu.memory_allocated()
        elif is_hpu_available():
            # torch.hpu.empty_cache() # not available on hpu as it reserves all device memory for the current process
            torch.hpu.reset_peak_memory_stats()  # reset the peak gauge to zero
            self.begin = torch.hpu.memory_allocated()
        elif is_neuron_available():
            torch.neuron.empty_cache()
            torch.neuron.reset_peak_memory_stats()  # reset the peak gauge to zero
            self.begin = torch.neuron.memory_allocated()
        return self

    def __exit__(self, *exc):
        """Record the final and peak allocated memory, then derive `used` and `peaked` in megabytes."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            self.end = torch.cuda.memory_allocated()
            self.peak = torch.cuda.max_memory_allocated()
        elif is_mlu_available():
            torch.mlu.empty_cache()
            self.end = torch.mlu.memory_allocated()
            # The peak must go into `self.peak`: writing it to `self.begin` would overwrite the baseline recorded in
            # `__enter__` and leave `self.peak` unset for the computation below.
            self.peak = torch.mlu.max_memory_allocated()
        elif is_sdaa_available():
            torch.sdaa.empty_cache()
            self.end = torch.sdaa.memory_allocated()
            self.peak = torch.sdaa.max_memory_allocated()
        elif is_musa_available():
            torch.musa.empty_cache()
            self.end = torch.musa.memory_allocated()
            self.peak = torch.musa.max_memory_allocated()
        elif is_npu_available():
            torch.npu.empty_cache()
            self.end = torch.npu.memory_allocated()
            self.peak = torch.npu.max_memory_allocated()
        elif is_xpu_available():
            torch.xpu.empty_cache()
            self.end = torch.xpu.memory_allocated()
            self.peak = torch.xpu.max_memory_allocated()
        elif is_hpu_available():
            # torch.hpu.empty_cache() # not available on hpu as it reserves all device memory for the current process
            self.end = torch.hpu.memory_allocated()
            self.peak = torch.hpu.max_memory_allocated()
        elif is_neuron_available():
            torch.neuron.empty_cache()
            self.end = torch.neuron.memory_allocated()
            self.peak = torch.neuron.max_memory_allocated()
        # Both deltas are relative to the amount allocated on entry
        self.used = b2mb(self.end - self.begin)
        self.peaked = b2mb(self.peak - self.begin)
        # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}")


def get_dataloaders(
    accelerator: Accelerator,
    batch_size: int = 16,
    model_name: str = "bert-base-cased",
    n_train: int = 320,
    n_val: int = 160,
):
    """
    Builds the train and validation `DataLoader`s over a slice of the `mrpc` configuration of the `glue` dataset.

    Args:
        accelerator (`Accelerator`):
            An `Accelerator` object; its `distributed_type` decides which padding strategy the collate function uses.
        batch_size (`int`, *optional*):
            The batch size for the train DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.
        model_name (`str`, *optional*):
            The name of the model whose tokenizer is loaded.
        n_train (`int`, *optional*):
            The number of training examples to use (taken from the start of the train split).
        n_val (`int`, *optional*):
            The number of validation examples to use (taken from the start of the validation split).

    Returns:
        `tuple`: `(train_dataloader, eval_dataloader)`.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    split_spec = {"train": f"train[:{n_train}]", "validation": f"validation[:{n_val}]"}
    raw_splits = load_dataset("glue", "mrpc", split=split_spec)

    def tokenize_function(examples):
        # max_length=None => fall back to the model max length (which is the default anyway)
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every split, dropping the raw text columns, then rename 'label' to 'labels' since that is the
    # name the models of the transformers library expect.
    encoded_splits = raw_splits.map(
        tokenize_function,
        batched=True,
        remove_columns=["idx", "sentence1", "sentence2"],
        load_from_cache_file=False,
    ).rename_column("label", "labels")

    def collate_fn(examples):
        running_on_xla = accelerator.distributed_type == DistributedType.XLA
        if not running_on_xla:
            return tokenizer.pad(examples, padding="longest", return_tensors="pt")
        # On TPU it's best to pad everything to the same length or training will be very slow.
        return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")

    # Only the training data is shuffled.
    train_dataloader = DataLoader(
        encoded_splits["train"], batch_size=batch_size, collate_fn=collate_fn, shuffle=True
    )
    eval_dataloader = DataLoader(
        encoded_splits["validation"], batch_size=EVAL_BATCH_SIZE, collate_fn=collate_fn, shuffle=False
    )

    return train_dataloader, eval_dataloader


def training_function(config, args):
    """
    Train a sequence-classification model and record the peak memory measured during each epoch.

    Args:
        config (`dict`):
            Hyper-parameters; the keys `lr`, `num_epochs`, `seed` and `batch_size` are read.
        args (`argparse.Namespace`):
            Parsed command-line options; `model_name_or_path`, `n_train`, `n_val`, `peak_memory_upper_bound` and
            `output_dir` are read.

    Raises:
        AssertionError: If `args.peak_memory_upper_bound` is set and the total peak memory of an epoch exceeds it.

    The per-epoch totals are written to `peak_memory_utilization.json` inside `args.output_dir` by the main process.
    """
    # Initialize accelerator
    accelerator = Accelerator()

    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])
    model_name = args.model_name_or_path

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size, model_name, args.n_train, args.n_val)

    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True)

    # Instantiate optimizer
    # A real AdamW is used unless the DeepSpeed config declares its own "optimizer" section, in which case a
    # `DummyOptim` placeholder is handed to `accelerator.prepare` instead.
    optimizer_cls = (
        AdamW
        if accelerator.state.deepspeed_plugin is None
        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
        else DummyOptim
    )
    optimizer = optimizer_cls(params=model.parameters(), lr=lr)

    # With DeepSpeed the accumulation factor is read from the DeepSpeed config; otherwise no accumulation is done.
    if accelerator.state.deepspeed_plugin is not None:
        gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[
            "gradient_accumulation_steps"
        ]
    else:
        gradient_accumulation_steps = 1
    # Computed from the dataloader length *before* `accelerator.prepare` is called below.
    max_training_steps = (len(train_dataloader) * num_epochs) // gradient_accumulation_steps

    # Instantiate scheduler
    # Same rule as for the optimizer: a `DummyScheduler` placeholder is used only when the DeepSpeed config
    # declares a "scheduler" section.
    if (
        accelerator.state.deepspeed_plugin is None
        or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
    ):
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=max_training_steps,
        )
    else:
        lr_scheduler = DummyScheduler(optimizer, total_num_steps=max_training_steps, warmup_num_steps=0)

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # We need to keep track of how many total steps we have iterated over
    overall_step = 0
    # We also need to keep track of the starting epoch so files are named properly
    starting_epoch = 0

    # Now we train the model
    # Maps "epoch-<n>" to the total peak memory measured for that epoch.
    train_total_peak_memory = {}
    for epoch in range(starting_epoch, num_epochs):
        # The whole epoch runs inside the tracker so that its begin/used/peaked readings cover all training steps.
        with TorchTracemalloc() as tracemalloc:
            model.train()
            for step, batch in enumerate(train_dataloader):
                outputs = model(**batch)
                loss = outputs.loss
                # Scale the loss so that accumulated gradients average over the accumulation window.
                loss = loss / gradient_accumulation_steps
                accelerator.backward(loss)
                # NOTE(review): this condition is true on step 0, i.e. right after the first micro-batch, rather
                # than once a full accumulation window has been seen — confirm this is intended.
                if step % gradient_accumulation_steps == 0:
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()

                overall_step += 1

        # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
        # NOTE(review): `begin` is passed through `b2mb` while `used`/`peaked` are not; presumably the tracker already
        # stores those two in MB — confirm against `TorchTracemalloc`, which is defined outside this function.
        accelerator.print(f"Memory before entering the train : {b2mb(tracemalloc.begin)}")
        accelerator.print(f"Memory consumed at the end of the train (end-begin): {tracemalloc.used}")
        accelerator.print(f"Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}")
        accelerator.print(
            f"Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
        )
        train_total_peak_memory[f"epoch-{epoch}"] = tracemalloc.peaked + b2mb(tracemalloc.begin)
        # Optional regression guard: fail as soon as an epoch exceeds the configured bound.
        if args.peak_memory_upper_bound is not None:
            assert train_total_peak_memory[f"epoch-{epoch}"] <= args.peak_memory_upper_bound, (
                "Peak memory usage exceeded the upper bound"
            )

    accelerator.wait_for_everyone()
    # Only the main process writes the report, so the file is written once.
    if accelerator.is_main_process:
        with open(os.path.join(args.output_dir, "peak_memory_utilization.json"), "w") as f:
            json.dump(train_total_peak_memory, f)
    accelerator.end_training()


def main():
    """
    Parse the command-line options of the peak-memory script and start the training run.

    The learning rate, seed and batch size are fixed here; only the number of epochs comes from the command line.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script tracking peak GPU memory usage.")

    # (flag, keyword arguments) pairs, registered in this exact order so that `--help` lists them as before.
    option_table = (
        (
            "--model_name_or_path",
            {
                "type": str,
                "default": "bert-base-cased",
                "help": "Path to pretrained model or model identifier from huggingface.co/models.",
                "required": False,
            },
        ),
        (
            "--output_dir",
            {
                "type": str,
                "default": ".",
                "help": "Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
            },
        ),
        (
            "--peak_memory_upper_bound",
            {
                "type": float,
                "default": None,
                "help": "The upper bound of peak memory usage in MB. If set, the training will throw an error if the peak memory usage exceeds this value.",
            },
        ),
        ("--n_train", {"type": int, "default": 320, "help": "Number of training examples to use."}),
        ("--n_val", {"type": int, "default": 160, "help": "Number of validation examples to use."}),
        ("--num_epochs", {"type": int, "default": 1, "help": "Number of train epochs."}),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    cli_args = parser.parse_args()
    hyper_parameters = {"lr": 2e-5, "num_epochs": cli_args.num_epochs, "seed": 42, "batch_size": 16}
    training_function(hyper_parameters, cli_args)


# Run the peak-memory training script only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/test_utils/scripts/external_deps/test_performance.py
================================================
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
from contextlib import nullcontext
from pathlib import Path

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup

from accelerate import Accelerator, DistributedType
from accelerate.parallelism_config import ParallelismConfig
from accelerate.utils import SAFE_WEIGHTS_NAME, set_seed
from accelerate.utils.deepspeed import DummyOptim, DummyScheduler


# NOTE(review): not referenced anywhere in the visible part of this script — confirm whether it is still needed.
MAX_GPU_BATCH_SIZE = 16
# Batch size of the validation DataLoader built in `get_dataloaders`.
EVAL_BATCH_SIZE = 32


def get_dataloaders(accelerator: Accelerator, batch_size: int = 16, model_name: str = "bert-base-cased"):
    """
    Build the train and validation `DataLoader`s for the `mrpc` task of the `glue` dataset.

    Args:
        accelerator (`Accelerator`):
            The `Accelerator` whose `distributed_type` decides how batches are padded.
        batch_size (`int`, *optional*):
            The batch size of the train DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.
        model_name (`str`, *optional*):
            The name of the pretrained model whose tokenizer is loaded.

    Returns:
        The `(train_dataloader, eval_dataloader)` pair.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    raw_datasets = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every split, drop the raw text columns, then rename 'label' to 'labels', which is the name the
    # models of the transformers library expect.
    encoded_datasets = raw_datasets.map(
        tokenize_function, batched=True, remove_columns=["idx", "sentence1", "sentence2"], load_from_cache_file=False
    ).rename_column("label", "labels")

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
        if accelerator.distributed_type == DistributedType.XLA:
            padding_options = {"padding": "max_length", "max_length": 128}
        else:
            padding_options = {"padding": "longest"}
        return tokenizer.pad(examples, return_tensors="pt", **padding_options)

    # Only the training split is shuffled; evaluation keeps the dataset order.
    training_loader = DataLoader(
        encoded_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
    )
    validation_loader = DataLoader(
        encoded_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
    )
    return training_loader, validation_loader


def training_function(config, args):
    """
    Train and evaluate a sequence-classification model, checking the learning-rate schedule and the final accuracy.

    Args:
        config (`dict`):
            Hyper-parameters; the keys `lr`, `num_epochs`, `seed` and `batch_size` are read.
        args (`argparse.Namespace`):
            Parsed command-line options; `model_name_or_path`, `tp_plan`, `tp_size`, `add_pad_token`,
            `performance_lower_bound` and `output_dir` are read.

    Raises:
        AssertionError: If the learning rate after the first optimizer step or after training is not the expected
            one, if the best accuracy is below `args.performance_lower_bound`, or if the saved weights file is
            missing after `Accelerator.save_model`.

    The per-epoch accuracies are written to `all_results.json` inside `args.output_dir` by the main process.
    """
    accelerator_kwargs = {}
    # need this for DeepSpeed tests as `args.tp_size` would be None and `torch.distributed.init_device_mesh` would fail
    if args.tp_size is not None:
        accelerator_kwargs["parallelism_config"] = ParallelismConfig(tp_size=args.tp_size)

    # Initialize accelerator
    accelerator = Accelerator(**accelerator_kwargs)

    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])
    model_name = args.model_name_or_path

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size, model_name)

    # Add TP related kwargs if provided
    model_kwargs = {}
    if args.tp_plan is not None:
        model_kwargs["tp_plan"] = args.tp_plan
    if args.tp_size is not None:
        model_kwargs["tp_size"] = args.tp_size

    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, **model_kwargs)

    # Fall back to token id 0 for padding when requested and the model config does not define a pad token.
    if args.add_pad_token:
        if model.config.pad_token_id is None:
            model.config.pad_token_id = 0

    # Instantiate optimizer
    # A real AdamW is used unless the DeepSpeed config declares its own "optimizer" section, in which case a
    # `DummyOptim` placeholder is handed to `accelerator.prepare` instead.
    optimizer_cls = (
        AdamW
        if accelerator.state.deepspeed_plugin is None
        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
        else DummyOptim
    )
    optimizer = optimizer_cls(params=model.parameters(), lr=lr)

    # Computed from the dataloader length *before* `accelerator.prepare` is called below.
    max_training_steps = len(train_dataloader) * num_epochs

    # Instantiate scheduler
    # `linear_decay_scheduler` records whether the real linear schedule is in use; the LR assertions below only
    # run in that case.
    linear_decay_scheduler = False
    if (
        accelerator.state.deepspeed_plugin is None
        or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
    ):
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=max_training_steps,
        )
        linear_decay_scheduler = True
    else:
        lr_scheduler = DummyScheduler(optimizer, total_num_steps=max_training_steps, warmup_num_steps=0)

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # We also need to keep track of the starting epoch so files are named properly
    starting_epoch = 0

    # Now we train the model
    metric = evaluate.load("glue", "mrpc")
    best_performance = 0
    # Maps "epoch-<n>" to the accuracy measured after that epoch.
    performance_metric = {}
    # With zero warmup and linear decay, one update lowers the LR by 1/N of its initial value, where N is
    # `max_training_steps` divided by the process count and the accumulation steps.
    # NOTE(review): presumably the division by `num_processes` accounts for the prepared dataloader being split
    # across processes — confirm.
    expected_lr_after_first_optim_step = lr * (
        1 - 1 / (max_training_steps / accelerator.num_processes / accelerator.gradient_accumulation_steps)
    )
    # Ensures the first-step LR assertion runs only once.
    lr_scheduler_check_completed = False
    for epoch in range(starting_epoch, num_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(model):
                outputs = model(**batch)
                loss = outputs.loss
                accelerator.backward(loss)
                # The optimizer step runs under `implicit_replication` when a TP plan is given, and under a no-op
                # context otherwise.
                context = nullcontext
                if args.tp_plan is not None:
                    from torch.distributed._tensor.experimental import implicit_replication

                    context = implicit_replication
                with context():
                    optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

                # assert the learning rate after first optimizer step
                # (skipped under mixed precision and when the scheduler is a DeepSpeed placeholder)
                if (
                    accelerator.sync_gradients
                    and not lr_scheduler_check_completed
                    and linear_decay_scheduler
                    and accelerator.state.mixed_precision == "no"
                ):
                    assert lr_scheduler.get_last_lr()[0] == expected_lr_after_first_optim_step, (
                        f"Wrong lr found at second step, expected {expected_lr_after_first_optim_step}, got {lr_scheduler.get_last_lr()[0]}"
                    )
                    lr_scheduler_check_completed = True

        model.eval()
        # Number of gathered evaluation samples counted so far; used to trim the last gathered batch.
        samples_seen = 0
        for step, batch in enumerate(eval_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            # It is slightly faster to call this once, than multiple times
            predictions, references = accelerator.gather(
                (predictions, batch["labels"])
            )  # If we are in a multiprocess environment, the last batch has duplicates
            if accelerator.use_distributed:
                if step == len(eval_dataloader) - 1:
                    # Keep only as many entries as are needed to reach the dataset size.
                    predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
                    references = references[: len(eval_dataloader.dataset) - samples_seen]
                else:
                    samples_seen += references.shape[0]
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)
        performance_metric[f"epoch-{epoch}"] = eval_metric["accuracy"]

        # Track the best accuracy over all epochs for the lower-bound check below.
        if best_performance < eval_metric["accuracy"]:
            best_performance = eval_metric["accuracy"]

    # check that the LR is 0
    if linear_decay_scheduler and accelerator.state.mixed_precision == "no":
        assert lr_scheduler.get_last_lr()[0] == 0, (
            f"Wrong lr found at last step, expected 0, got {lr_scheduler.get_last_lr()[0]}"
        )

    # Optional regression guard on the best accuracy reached during training.
    if args.performance_lower_bound is not None:
        assert args.performance_lower_bound <= best_performance, (
            f"Best performance metric {best_performance} is lower than the lower bound {args.performance_lower_bound}"
        )

    accelerator.wait_for_everyone()
    # Only the main process writes the report, so the file is written once.
    if accelerator.is_main_process:
        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
            json.dump(performance_metric, f)

    # TODO: skip saving of the model test for TP until the feature lands
    if args.tp_plan is None:
        # Finally try saving the model
        accelerator.save_model(model, args.output_dir)
    # Wait for every process before checking that the weights file exists.
    accelerator.wait_for_everyone()
    if args.tp_plan is None:
        assert Path(args.output_dir, SAFE_WEIGHTS_NAME).exists(), (
            "Model was not saved when calling `Accelerator.save_model`"
        )
    accelerator.end_training()


def main():
    """
    Parse the command-line options of the performance script and start the training run.

    The learning rate, seed and batch size are fixed here; only the number of epochs comes from the command line.
    """

    def parse_bool(value):
        """
        Convert a command-line string into a real boolean.

        `type=bool` must not be used with argparse: `bool("False")` is `True`, so every non-empty value would
        enable the flag. Recognised spellings are matched case-insensitively; anything else is rejected.
        """
        if isinstance(value, bool):
            return value
        normalized = value.strip().lower()
        if normalized in ("true", "t", "yes", "y", "1", "on"):
            return True
        # The empty string stays falsy, as it was with `bool("")`.
        if normalized in ("false", "f", "no", "n", "0", "off", ""):
            return False
        raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser(description="Simple example of training script tracking model performance.")
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        default="bert-base-cased",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=False,
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=".",
        help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
    )
    parser.add_argument(
        "--performance_lower_bound",
        type=float,
        default=None,
        help="Optional lower bound for the performance metric. If set, the training will throw error when the performance metric drops below this value.",
    )
    parser.add_argument(
        "--num_epochs",
        type=int,
        default=3,
        help="Number of train epochs.",
    )
    parser.add_argument(
        "--add_pad_token",
        type=parse_bool,
        default=False,
        help="To add pad token if not exists.",
    )
    parser.add_argument(
        "--tp_plan",
        type=str,
        default=None,
        help="pass 'auto' to use TP",
    )
    parser.add_argument(
        "--tp_size",
        type=int,
        default=None,
        help="TP size to be used to shard the model",
    )
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": args.num_epochs, "seed": 42, "batch_size": 16}
    training_function(config, args)


# Run the performance training script only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/test_utils/scripts/external_deps/test_pippy.py
================================================
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from transformers import (
    BertConfig,
    BertForMaskedLM,
    GPT2Config,
    GPT2ForSequenceClassification,
)

from accelerate import PartialState
from accelerate.inference import prepare_pippy
from accelerate.test_utils import torch_device
from accelerate.utils import DistributedType, set_seed


# Maps a short model name to the `(model class, config class, sequence length)` triple that
# `get_model_and_data_for_text` unpacks to build a model and its random inputs.
model_to_config = {
    "bert": (BertForMaskedLM, BertConfig, 512),
    "gpt2": (GPT2ForSequenceClassification, GPT2Config, 1024),
}


def get_model_and_data_for_text(model_name, device, num_processes: int = 2):
    """
    Build a model from its default config together with random token ids to trace and run it.

    Args:
        model_name (`str`):
            A key of `model_to_config`.
        device:
            The device on which the random inputs are created.
        num_processes (`int`, *optional*, defaults to 2):
            The number of rows of the inference input.

    Returns:
        A `(model, trace_input, inference_inputs)` tuple, where `trace_input` has a single row and
        `inference_inputs` has `num_processes` rows.
    """
    model_cls, config_cls, seq_len = model_to_config[model_name]
    # Eventually needed for batch inference tests on gpt-2 when bs != 1
    # if model_name == "gpt2":
    #     config_args["pad_token_id"] = 0
    config_args = {}
    model_config = config_cls(**config_args)
    model = model_cls(model_config)

    def random_token_ids(num_rows):
        # Token ids drawn uniformly from the model vocabulary, one sequence of `seq_len` ids per row.
        return torch.randint(
            low=0,
            high=model_config.vocab_size,
            size=(num_rows, seq_len),
            device=device,
            dtype=torch.int64,
            requires_grad=False,
        )

    # The tuple is evaluated left to right, so the trace input is still drawn before the inference inputs.
    return model, random_token_ids(1), random_token_ids(num_processes)


def test_bert(batch_size: int = 2):
    """
    Run a BERT model prepared with `prepare_pippy` and check that only the last process receives an output.

    Args:
        batch_size (`int`, *optional*, defaults to 2):
            The number of rows of the inference input.
    """
    set_seed(42)
    state = PartialState()
    bert, trace_input, inference_inputs = get_model_and_data_for_text("bert", "cpu", batch_size)
    pipelined_model = prepare_pippy(
        bert, example_args=(trace_input,), no_split_module_classes=bert._no_split_modules
    )
    # The inputs are created on CPU and moved to the test device just before the forward pass.
    device_inputs = inference_inputs.to(torch_device)
    with torch.no_grad():
        output = pipelined_model(device_inputs)
    # Zach: Check that we just grab the real outputs we need at the end
    if state.is_last_process:
        assert output is not None, "Output was not generated in the last process!"
    else:
        assert output is None, "Output was not generated on just the last process!"


def test_gpt2(batch_size: int = 2):
    """
    Run a GPT-2 model prepared with `prepare_pippy` and check that only the last process receives an output.

    Args:
        batch_size (`int`, *optional*, defaults to 2):
            The number of rows of the inference input.
    """
    set_seed(42)
    state = PartialState()
    gpt2, trace_input, inference_inputs = get_model_and_data_for_text("gpt2", "cpu", batch_size)
    pipelined_model = prepare_pippy(
        gpt2, example_args=(trace_input,), no_split_module_classes=gpt2._no_split_modules
    )
    # The inputs are created on CPU and moved to the test device just before the forward pass.
    device_inputs = inference_inputs.to(torch_device)
    with torch.no_grad():
        output = pipelined_model(device_inputs)
    # Zach: Check that we just grab the real outputs we need at the end
    if state.is_last_process:
        assert output is not None, "Output was not generated in the last process!"
    else:
        assert output is None, "Output was not generated on just the last process!"


# Currently disabled, enable again once PyTorch pippy interface can trace a resnet34
# def test_resnet(batch_size: int = 2):
#     set_seed(42)
#     state = PartialState()
#     model = resnet34()
#     input_tensor = torch.rand(1, 3, 224, 224)
#     model = prepare_pippy(
#         model,
#         example_args=(input_tensor,),
#     )
#     inference_inputs = torch.rand(batch_size, 3, 224, 224)
#     inputs = send_to_device(inference_inputs, torch_device)
#     with torch.no_grad():
#         output = model(inputs)
#     # Zach: Check that we just grab the real outputs we need at the end
#     if not state.is_last_process:
#         assert output is None, "Output was not generated on just the last process!"
#     else:
#         assert output is not None, "Output was not generated in the last process!"


# Script entry point: run the pippy tests on multi-device setups and always tear the process group down.
if __name__ == "__main__":
    state = PartialState()
    state.print("Testing pippy integration...")
    try:
        if state.distributed_type not in (
            DistributedType.MULTI_GPU,
            DistributedType.MULTI_XPU,
            DistributedType.MULTI_HPU,
        ):
            print("Less than two GPUs found, not running tests!")
        else:
            state.print("Testing GPT2...")
            test_gpt2()
            # Issue: When modifying the tokenizer for batch GPT2 inference, there's an issue
            # due to references
            # NameError: cannot access free variable 'chunk_args_list' where it is not associated with a value in enclosing scope
            # test_gpt2(3)
            state.print("Testing BERT...")
            test_bert()
    finally:
        # Runs even when a test fails.
        state.destroy_process_group()


================================================
FILE: src/accelerate/test_utils/scripts/external_deps/test_zero3_integration.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch.distributed

from accelerate.test_utils import require_huggingface_suite, torch_device
from accelerate.utils import is_transformers_available


if is_transformers_available():
    from transformers import AutoModel, TrainingArguments


# Model identifier passed to `AutoModel.from_pretrained` in `init_torch_dist_then_launch_deepspeed`.
GPT2_TINY = "sshleifer/tiny-gpt2"


@require_huggingface_suite
def init_torch_dist_then_launch_deepspeed():
    """
    Initialize `torch.distributed` first, then create `TrainingArguments` with a ZeRO stage 3 config and load a model.

    The function only asserts that both objects were created.
    """
    # Backend per device type; anything other than "xpu" or "hpu" uses NCCL.
    backend = {"xpu": "xccl", "hpu": "hccl"}.get(torch_device, "nccl")
    torch.distributed.init_process_group(backend=backend)

    zero3_config = {
        "zero_optimization": {
            "stage": 3,
        },
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
    }
    train_args = TrainingArguments(output_dir="./", deepspeed=zero3_config)
    model = AutoModel.from_pretrained(GPT2_TINY)
    assert train_args is not None
    assert model is not None


def main():
    """Run the ZeRO-3 integration check."""
    init_torch_dist_then_launch_deepspeed()


# Run the ZeRO-3 integration check only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/test_utils/scripts/test_cli.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

from accelerate.utils import is_xpu_available


def main():
    """
    Print how many accelerators are visible to this process.

    CUDA devices are checked first, then XPU devices; when neither is available the count is 0 and the label
    stays "GPU".
    """
    if torch.cuda.is_available():
        accelerator_type, num_accelerators = "GPU", torch.cuda.device_count()
    elif is_xpu_available():
        accelerator_type, num_accelerators = "XPU", torch.xpu.device_count()
    else:
        accelerator_type, num_accelerators = "GPU", 0
    print(f"Successfully ran on {num_accelerators} {accelerator_type}s")


# Print the accelerator summary only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/test_utils/scripts/test_ddp_comm_hook.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs, PartialState
from accelerate.utils import is_hpu_available


class MockModel(torch.nn.Module):
    """A module with a single 40x20 parameter that is initialized identically on every construction."""

    def __init__(self):
        super().__init__()
        # Re-seeding the global RNG here makes every instance start from the same parameter values.
        torch.manual_seed(0)
        initial_values = torch.randn(40, 20)
        self.p = torch.nn.Parameter(initial_values)

    def forward(self, x, rank):
        """Return the parameter multiplied element-wise by `x` raised to the power `1 + rank`."""
        exponent = 1 + rank
        return self.p * x.pow(exponent)


def _run_and_get_grads(model, rank):
    """
    Run one forward/backward pass on a fixed random input and return the gradient of the model's first parameter.

    The global RNG is seeded with 2024 first, so every call draws the same 40x20 input.
    """
    torch.manual_seed(2024)
    sample = torch.randn(40, 20)
    model(sample, rank).mean().backward()
    first_param = next(model.parameters())
    return first_param.grad


def test_ddp_comm_hook(comm_hook, comm_wrapper, comm_state_option):
    """
    Compare the gradients of a model prepared with a DDP communication hook against a plain DDP reference.

    Args:
        comm_hook:
            The communication hook passed to `DistributedDataParallelKwargs`.
        comm_wrapper:
            The communication wrapper passed to `DistributedDataParallelKwargs`.
        comm_state_option (`dict`):
            The state options passed to `DistributedDataParallelKwargs`.

    Raises:
        AssertionError: If the two gradients differ by more than the `1e-2` relative/absolute tolerances.
    """
    accelerator = Accelerator(
        kwargs_handlers=[
            DistributedDataParallelKwargs(
                comm_hook=comm_hook,
                comm_wrapper=comm_wrapper,
                comm_state_option=comm_state_option,
            )
        ]
    )

    # Model wrapped by accelerate, with the requested hook configuration.
    hooked_model = accelerator.prepare(MockModel())
    hook_grads = _run_and_get_grads(hooked_model, accelerator.local_process_index)

    # Reference: the same model wrapped directly in DDP without any hook.
    local_index = accelerator.local_process_index
    reference_model = torch.nn.parallel.DistributedDataParallel(
        MockModel().to(accelerator.device),
        device_ids=[local_index],
        output_device=local_index,
    )
    reference_grads = _run_and_get_grads(reference_model, local_index)

    torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-2, atol=1e-2)


def main():
    """
    Run `test_ddp_comm_hook` for every hook/wrapper combination, then destroy the process group.

    On HPU, combinations that involve the FP16 or BF16 hook (as hook or as wrapper) are skipped.
    """
    hook_types = DDPCommunicationHookType
    # (comm_hook, comm_wrapper, comm_state_option) triples, run in this order.
    scenarios = [
        (hook_types.NO, hook_types.NO, {}),
        (hook_types.FP16, hook_types.NO, {}),
        (hook_types.BF16, hook_types.NO, {}),
        (hook_types.POWER_SGD, hook_types.NO, {}),
        (hook_types.POWER_SGD, hook_types.FP16, {}),
        (hook_types.POWER_SGD, hook_types.BF16, {}),
        (hook_types.POWER_SGD, hook_types.NO, {"matrix_approximation_rank": 2}),
        (hook_types.BATCHED_POWER_SGD, hook_types.NO, {}),
        (hook_types.BATCHED_POWER_SGD, hook_types.FP16, {}),
        (hook_types.BATCHED_POWER_SGD, hook_types.BF16, {}),
    ]

    for comm_hook, comm_wrapper, comm_state_option in scenarios:
        if is_hpu_available():
            unsupported_on_hpu = {hook_types.FP16, hook_types.BF16}
            if not unsupported_on_hpu.isdisjoint((comm_hook, comm_wrapper)):
                print(f"Skipping test DDP comm hook: {comm_hook}, comm wrapper: {comm_wrapper} on HPU")
                continue

        print(f"Test DDP comm hook: {comm_hook}, comm wrapper: {comm_wrapper}")
        test_ddp_comm_hook(comm_hook, comm_wrapper, comm_state_option)

    PartialState().destroy_process_group()


# Run the DDP communication hook tests only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/test_utils/scripts/test_distributed_data_loop.py
================================================
#!/usr/bin/env python

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle
import tempfile
import warnings
from unittest.mock import Mock

import torch
from torch.utils.data import (
    BatchSampler,
    DataLoader,
    Dataset,
    IterableDataset,
    RandomSampler,
    TensorDataset,
    default_collate,
)

from accelerate.accelerator import Accelerator, DataLoaderConfiguration
from accelerate.utils.dataclasses import DistributedType


# Length reported by `DummyDataset.__len__`.
NUM_ELEMENTS = 22
# NOTE(review): `NUM_WORKERS` and `BATCH_SIZE` are not referenced in the visible part of this script; presumably
# they are used by tests further down — confirm.
NUM_WORKERS = 4
BATCH_SIZE = 4


class DummyDataset(Dataset):
    """
    A map-style dataset of `NUM_ELEMENTS` synthetic examples.

    Each example is a dict with the keys `index` (the position asked for), `label` (`index % 2`) and
    `random_augmentation` (a fresh `torch.rand` draw on every access).
    """

    def __len__(self):
        return NUM_ELEMENTS

    def __getitem__(self, index):
        """
        Return the example(s) for `index`.

        Args:
            index (`int`, `slice` or iterable of `int`):
                An `int` returns a single example dict; a `slice` or an iterable returns a list of example dicts.
        """
        if isinstance(index, int):
            indices, single = [index], True
        elif isinstance(index, slice):
            # Resolve the slice against the dataset length. The class defines no `size` attribute, so the previous
            # `self.size` raised `AttributeError` on every slice access.
            indices, single = list(range(*index.indices(len(self)))), False
        else:
            indices, single = list(index), False

        batch = [{"index": i, "label": i % 2, "random_augmentation": torch.rand(1).item()} for i in indices]

        # An integer index yields the bare example, not a one-element list.
        return batch[0] if single else batch


class DummyIterableDataset(IterableDataset):
    """An iterable-style dataset that yields the items of the iterable it was given, in order."""

    def __init__(self, data):
        self.data = data

    def __iter__(self):
        for item in self.data:
            yield item


def create_accelerator(even_batches=True):
    """
    Create an `Accelerator` whose dataloader configuration uses the given `even_batches` setting.

    Raises:
        AssertionError: If the script is not running with exactly two processes.
    """
    accelerator = Accelerator(dataloader_config=DataLoaderConfiguration(even_batches=even_batches))
    assert accelerator.num_processes == 2, "this script expects that two GPUs are available"
    return accelerator


def create_dataloader(
    accelerator: Accelerator, dataset_size: int, batch_size: int, iterable: bool = False, shuffle: bool = False
):
    """
    Create a simple prepared DataLoader over the integers `0..dataset_size - 1` for the test cases.

    Args:
        accelerator (`Accelerator`):
            The accelerator used to prepare the DataLoader.
        dataset_size (`int`):
            The number of elements in the dataset.
        batch_size (`int`):
            The batch size of the DataLoader before preparation.
        iterable (`bool`, *optional*, defaults to `False`):
            Whether to wrap the values in a `DummyIterableDataset` instead of a `TensorDataset`.
        shuffle (`bool`, *optional*, defaults to `False`):
            Whether to permute the values randomly. Only the iterable dataset uses the permuted values; the
            `TensorDataset` is always built from the ordered range.
    """
    values = torch.as_tensor(range(dataset_size))
    if shuffle:
        values = values[torch.randperm(values.size(0))]

    dataset = DummyIterableDataset(values) if iterable else TensorDataset(torch.as_tensor(range(dataset_size)))

    return accelerator.prepare(DataLoader(dataset, batch_size=batch_size))


def verify_dataloader_batch_sizes(
    accelerator: Accelerator,
    dataset_size: int,
    batch_size: int,
    process_0_expected_batch_sizes: list[int],
    process_1_expected_batch_sizes: list[int],
):
    """
    Iterate a freshly prepared dataloader and compare the batch sizes seen by this process with the expectation
    for its process index (only indices 0 and 1 are checked).
    """
    dl = create_dataloader(accelerator=accelerator, dataset_size=dataset_size, batch_size=batch_size)

    observed_sizes = []
    for batch in dl:
        observed_sizes.append(len(batch[0]))

    expected_by_rank = {
        0: process_0_expected_batch_sizes,
        1: process_1_expected_batch_sizes,
    }
    rank = accelerator.process_index
    if rank in expected_by_rank:
        assert observed_sizes == expected_by_rank[rank]


def test_default_ensures_even_batch_sizes():
    """Check that, with the default configuration, both processes receive the same batch sizes."""
    accelerator = create_accelerator()

    scenarios = (
        # without padding, we would expect a different number of batches
        {
            "dataset_size": 3,
            "batch_size": 1,
            "process_0_expected_batch_sizes": [1, 1],
            "process_1_expected_batch_sizes": [1, 1],
        },
        # without padding, we would expect the same number of batches, but different sizes
        {
            "dataset_size": 7,
            "batch_size": 2,
            "process_0_expected_batch_sizes": [2, 2],
            "process_1_expected_batch_sizes": [2, 2],
        },
    )
    for scenario in scenarios:
        verify_dataloader_batch_sizes(accelerator, **scenario)


def test_can_disable_even_batches():
    """Check that with `even_batches=False` the second process gets fewer or smaller batches."""
    accelerator = create_accelerator(even_batches=False)

    scenarios = (
        # three samples, one per batch: the second process sees one batch less
        {
            "dataset_size": 3,
            "batch_size": 1,
            "process_0_expected_batch_sizes": [1, 1],
            "process_1_expected_batch_sizes": [1],
        },
        # seven samples, two per batch: the second process ends on a short batch
        {
            "dataset_size": 7,
            "batch_size": 2,
            "process_0_expected_batch_sizes": [2, 2],
            "process_1_expected_batch_sizes": [2, 1],
        },
    )
    for scenario in scenarios:
        verify_dataloader_batch_sizes(accelerator, **scenario)


def test_can_join_uneven_inputs():
    """
    Train over an uneven dataloader inside `join_uneven_inputs` and check how many steps each process ran.
    """
    accelerator = create_accelerator(even_batches=False)

    ddp_model = accelerator.prepare(torch.nn.Linear(1, 1))
    dl = create_dataloader(accelerator, dataset_size=3, batch_size=1)

    steps_run = []
    with accelerator.join_uneven_inputs([ddp_model]):
        for step, batch in enumerate(dl):
            prediction = ddp_model(batch[0].float())
            prediction.sum().backward()
            steps_run.append(step)

    accelerator.wait_for_everyone()

    # Three samples at batch size one: process 0 is expected to run two steps, process 1 a single one.
    expected_steps = {0: [0, 1], 1: [0]}
    rank = accelerator.process_index
    if rank in expected_steps:
        assert steps_run == expected_steps[rank]


def test_join_raises_warning_for_non_ddp_distributed(accelerator):
    """
    Check that entering `join_uneven_inputs` emits a `UserWarning` mentioning "only supported for multi-GPU".

    `main` calls this after switching `accelerator.state.distributed_type` to a non-DDP value.
    """
    with warnings.catch_warnings(record=True) as caught:
        # Record every warning regardless of the filters active in the calling process; otherwise `caught`
        # may stay empty and the check would fail with an unrelated error.
        warnings.simplefilter("always")
        with accelerator.join_uneven_inputs([Mock()]):
            pass

    # Look through everything that was recorded, so an unrelated warning emitted afterwards cannot hide it.
    assert any(
        issubclass(record.category, UserWarning) and "only supported for multi-GPU" in str(record.message)
        for record in caught
    ), f"Expected a UserWarning about multi-GPU support, got: {[str(record.message) for record in caught]}"


def test_join_can_override_even_batches():
    """
    Check that `join_uneven_inputs(even_batches=...)` overrides the setting on every prepared dataloader while the
    context is active, and that the configured value is back once it exits.
    """
    default_even_batches, overridden_even_batches = True, False
    accelerator = create_accelerator(even_batches=default_even_batches)
    ddp_model = accelerator.prepare(torch.nn.Linear(1, 1))
    # Stand-ins for a training and a validation dataloader.
    dataloaders = [create_dataloader(accelerator, dataset_size=3, batch_size=1) for _ in range(2)]

    with accelerator.join_uneven_inputs([ddp_model], even_batches=overridden_even_batches):
        values_inside_context = [dl.batch_sampler.even_batches for dl in dataloaders]

    for value in values_inside_context:
        assert value == overridden_even_batches
    for dl in dataloaders:
        assert dl.batch_sampler.even_batches == default_even_batches


def test_join_can_override_for_mixed_type_dataloaders():
    """
    Check that overriding `even_batches` works when both an iterable and a map-style dataloader were prepared:
    no `AttributeError` may escape, and the map-style dataloader is overridden and then restored.
    """
    default_even_batches, overridden_even_batches = True, False
    accelerator = create_accelerator(even_batches=default_even_batches)
    ddp_model = accelerator.prepare(torch.nn.Linear(1, 1))
    # The iterable dataloader only needs to be prepared by the accelerator; it is not used directly.
    create_dataloader(accelerator, dataset_size=3, batch_size=1, iterable=True)
    map_style_dl = create_dataloader(accelerator, dataset_size=3, batch_size=1)

    with warnings.catch_warnings():
        # Warnings are not under test here, so silence all of them.
        warnings.filterwarnings("ignore")
        try:
            with accelerator.join_uneven_inputs([ddp_model], even_batches=overridden_even_batches):
                value_inside_context = map_style_dl.batch_sampler.even_batches
        except AttributeError:
            # ensure attribute error is not raised when processing iterable dl
            raise AssertionError

    assert value_inside_context == overridden_even_batches
    assert map_style_dl.batch_sampler.even_batches == default_even_batches


def test_join_raises_warning_for_iterable_when_overriding_even_batches():
    """
    Check that overriding `even_batches` while an iterable dataloader is prepared emits a `UserWarning`
    mentioning "only supported for map-style datasets".
    """
    accelerator = create_accelerator()
    model = torch.nn.Linear(1, 1)
    ddp_model = accelerator.prepare(model)
    # Only the preparation matters: the accelerator has to know about an iterable dataloader.
    create_dataloader(accelerator, dataset_size=3, batch_size=1, iterable=True)

    with warnings.catch_warnings(record=True) as caught:
        # Record every warning regardless of the filters active in the calling process; otherwise `caught`
        # may stay empty and the check would fail with an unrelated error.
        warnings.simplefilter("always")
        with accelerator.join_uneven_inputs([ddp_model], even_batches=False):
            pass

    # Look through everything that was recorded, so an unrelated warning emitted afterwards cannot hide it.
    assert any(
        issubclass(record.category, UserWarning) and "only supported for map-style datasets" in str(record.message)
        for record in caught
    ), f"Expected a UserWarning about map-style datasets, got: {[str(record.message) for record in caught]}"


def test_pickle_accelerator():
    """Round-trip an `Accelerator` through `pickle` and compare the state `__dict__` before and after."""
    accelerator = create_accelerator()
    data_loader = create_dataloader(accelerator, dataset_size=32, batch_size=4)
    _ = accelerator.prepare(data_loader)
    restored_accelerator = pickle.loads(pickle.dumps(accelerator))
    # TODO: Maybe this should be implemented as __eq__ for AcceleratorState?
    original_state = accelerator.state.__dict__
    restored_state = restored_accelerator.state.__dict__
    assert original_state == restored_state


def test_data_loader(data_loader, accelerator):
    """
    Iterate the prepared `data_loader` for one epoch, gather the `index` of every sample with
    `gather_for_metrics`, and check that `NUM_ELEMENTS` distinct indices were seen.
    """
    prepared_loader = accelerator.prepare(data_loader)

    gathered_indices = []
    for batch in prepared_loader:
        index, _ = accelerator.gather_for_metrics((batch["index"], batch["label"]))
        gathered_indices += index.detach().cpu().numpy().tolist()

    # Duplicates collapse in the set, so a smaller count means some samples were never produced.
    distinct_indices = set(sorted(gathered_indices))
    assert len(distinct_indices) == NUM_ELEMENTS, (
        "Not all the dataset elements have been iterated in an epoch due to duplication of samples across processes."
    )


def _test_stateful_dataloader_resume(accelerator, iterable):
    """
    Helper: iterate a stateful dataloader, snapshot it with `state_dict()` at step index 2, finish the epoch, then
    restore the snapshot with `load_state_dict` and check that the resumed batches equal the ones that followed
    the snapshot in the first pass.

    `iterable` selects an iterable or a map-style dataset. The accelerator's `dataloader_config` is swapped for a
    stateful one during the check and restored afterwards, even on failure.
    """
    original_config = accelerator.dataloader_config
    try:
        accelerator.dataloader_config = DataLoaderConfiguration(use_stateful_dataloader=True)
        prepared_dl = create_dataloader(
            accelerator, dataset_size=32 * accelerator.num_processes, batch_size=4, iterable=iterable, shuffle=True
        )

        save_step = 2
        batches_after_save = []
        for step, batch in enumerate(prepared_dl):
            if step == save_step:
                # Snapshot taken right after the batch at `save_step` was produced.
                state_dict = prepared_dl.state_dict()
            elif step > save_step:
                batches_after_save.append(batch)
        not_skipped_batches = accelerator.gather(batches_after_save)

        prepared_dl.load_state_dict(state_dict)
        resumed_batches = accelerator.gather([batch for batch in prepared_dl])

        assert len(not_skipped_batches) == len(resumed_batches), (
            f"Expected {len(not_skipped_batches)} batches after resume, got {len(resumed_batches)}"
        )
        for b1, b2 in zip(not_skipped_batches, resumed_batches):
            for v1, v2 in zip(b1, b2):
                assert torch.equal(v1, v2), f"Batch {b1} and {b2} are not equal"
    finally:
        accelerator.dataloader_config = original_config


def test_stateful_dataloader(accelerator):
    """
    Check that a stateful dataloader can be snapshotted mid-epoch with `state_dict` and resumed with
    `load_state_dict`, yielding the same batches that followed the snapshot in the first pass.

    Runs first with an iterable dataset, then with a map-style one.
    """
    for iterable in (True, False):
        _test_stateful_dataloader_resume(accelerator, iterable=iterable)


def _test_stateful_dataloader_save_state_resume(accelerator, iterable):
    """
    Helper: iterate a stateful dataloader, call `Accelerator.save_state` at step index 2, finish the epoch, then
    call `Accelerator.load_state` and check that the resumed batches equal the ones that followed the save in the
    first pass.

    The state is written to a temporary directory. The accelerator's `dataloader_config` is swapped for a
    stateful one during the check and restored afterwards, even on failure.
    """
    original_config = accelerator.dataloader_config
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            accelerator.dataloader_config = DataLoaderConfiguration(use_stateful_dataloader=True)
            prepared_dl = create_dataloader(
                accelerator, dataset_size=32 * accelerator.num_processes, batch_size=4, iterable=iterable, shuffle=True
            )

            save_step = 2
            batches_after_save = []
            for step, batch in enumerate(prepared_dl):
                if step == save_step:
                    # Checkpoint written right after the batch at `save_step` was produced.
                    accelerator.save_state(tmpdir)
                elif step > save_step:
                    batches_after_save.append(batch)
            not_skipped_batches = accelerator.gather(batches_after_save)

            accelerator.load_state(tmpdir)
            resumed_batches = accelerator.gather([batch for batch in prepared_dl])

            assert len(not_skipped_batches) == len(resumed_batches), (
                f"Expected {len(not_skipped_batches)} batches after resume, got {len(resumed_batches)}"
            )
            for b1, b2 in zip(not_skipped_batches, resumed_batches):
                for v1, v2 in zip(b1, b2):
                    assert torch.equal(v1, v2), f"Batch {b1} and {b2} are not equal"
    finally:
        accelerator.dataloader_config = original_config


def test_stateful_dataloader_save_state(accelerator):
    """
    Check that a stateful dataloader can be checkpointed mid-epoch with `Accelerator.save_state` and resumed with
    `Accelerator.load_state`, yielding the same batches that followed the save in the first pass.

    Runs first with an iterable dataset, then with a map-style one.
    """
    for iterable in (True, False):
        _test_stateful_dataloader_save_state_resume(accelerator, iterable=iterable)


def main():
    """
    Run every check of this script in order, announcing each one through `accelerator.print`.

    The order matters: all processes have to execute the same sequence of checks.
    """
    accelerator = create_accelerator()
    # Seed torch's RNG with the process index, so each process draws a different random stream.
    torch.manual_seed(accelerator.process_index)

    accelerator.print("Test that even_batches variable ensures uniform batches across processes")
    test_default_ensures_even_batch_sizes()

    accelerator.print("Run tests with even_batches disabled")
    test_can_disable_even_batches()

    accelerator.print("Test joining uneven inputs")
    test_can_join_uneven_inputs()

    accelerator.print("Test overriding even_batches when joining uneven inputs")
    test_join_can_override_even_batches()

    accelerator.print("Test overriding even_batches for mixed dataloader types")
    test_join_can_override_for_mixed_type_dataloaders()

    accelerator.print("Test overriding even_batches raises a warning for iterable dataloaders")
    test_join_raises_warning_for_iterable_when_overriding_even_batches()

    accelerator.print("Test join with non DDP distributed raises warning")
    # Temporarily report FSDP as the distributed type for the warning check, then put the real value back.
    original_state = accelerator.state.distributed_type
    accelerator.state.distributed_type = DistributedType.FSDP
    test_join_raises_warning_for_non_ddp_distributed(accelerator)
    accelerator.state.distributed_type = original_state

    accelerator.print("Test pickling an accelerator")
    test_pickle_accelerator()

    # The remaining dataloader checks all iterate the same `DummyDataset` through different sampler setups.
    dataset = DummyDataset()

    accelerator.print("Test DataLoader with shuffle=False")
    loader = DataLoader(dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
    test_data_loader(loader, accelerator)

    accelerator.print("Test DataLoader with shuffle=True")
    loader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
    test_data_loader(loader, accelerator)

    accelerator.print("Test DataLoader with batch_sampler")
    sampler = BatchSampler(RandomSampler(dataset), batch_size=BATCH_SIZE, drop_last=False)
    loader = DataLoader(dataset, batch_sampler=sampler, num_workers=NUM_WORKERS)
    test_data_loader(loader, accelerator)

    accelerator.print("Test DataLoader with sampler as an instance of `BatchSampler`")
    # Here the `BatchSampler` is passed as `sampler` with `batch_size=None`, so the dataset receives a whole
    # list of indices per item and `default_collate` assembles the batch.
    sampler = BatchSampler(RandomSampler(dataset), batch_size=BATCH_SIZE, drop_last=False)
    loader = DataLoader(dataset, sampler=sampler, batch_size=None, collate_fn=default_collate, num_workers=NUM_WORKERS)
    test_data_loader(loader, accelerator)
    # Stateful dataloader checks: resume through `load_state_dict`, then through `Accelerator.load_state`.
    test_stateful_dataloader(accelerator)
    test_stateful_dataloader_save_state(accelerator)

    accelerator.end_training()


# Run all checks when the file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/test_utils/scripts/test_merge_weights.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import logging
import shutil
from pathlib import Path

import torch
from safetensors.torch import load_file
from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy, StateDictType
from torch.utils.data import DataLoader

from accelerate import Accelerator, FullyShardedDataParallelPlugin
from accelerate.commands.merge import merge_command, merge_command_parser
from accelerate.state import AcceleratorState
from accelerate.test_utils import torch_device
from accelerate.test_utils.training import RegressionDataset
from accelerate.utils import merge_fsdp_weights, patch_environment, save_fsdp_model


# Configure root logging at INFO level for the whole script run.
logging.basicConfig(level=logging.INFO)

# Parser returned by `merge_command_parser`, shared by the `test_merge_weights_command_*` functions below.
parser = merge_command_parser()


class TinyModel(torch.nn.Module):
    """Two 16-to-16 linear layers with a ReLU in between."""

    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(16, 16)
        self.activation = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(16, 16)
        # Registered as a submodule but not applied in `forward`.
        self.softmax = torch.nn.Softmax()

    def forward(self, x):
        hidden = self.linear1(x)
        hidden = self.activation(hidden)
        return self.linear2(hidden)


def setup():
    """
    Build the objects the merge checks work on.

    Resets any existing `AcceleratorState`, creates a fully sharded FSDP plugin that uses sharded state dicts,
    sets its auto-wrap policy for a fresh `TinyModel`, and prepares the model with a new `Accelerator`.

    Returns:
        The prepared model, the FSDP plugin and the accelerator, in that order.
    """
    # Start from a clean singleton if something already populated the shared state.
    state_is_populated = AcceleratorState._shared_state != {}
    if state_is_populated:
        AcceleratorState()._reset_state()

    fsdp_plugin = FullyShardedDataParallelPlugin(
        sharding_strategy=ShardingStrategy.FULL_SHARD,
        state_dict_type=StateDictType.SHARDED_STATE_DICT,
    )
    tiny_model = TinyModel()
    # The policy is set while the environment is patched with `fsdp_auto_wrap_policy="SIZE_BASED_WRAP"`.
    with patch_environment(fsdp_auto_wrap_policy="SIZE_BASED_WRAP"):
        fsdp_plugin.set_auto_wrap_policy(tiny_model)

    fsdp_accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
    prepared_model = fsdp_accelerator.prepare(tiny_model)
    return prepared_model, fsdp_plugin, fsdp_accelerator


def mock_training(accelerator, model):
    """
    Run three epochs of SGD (lr 0.1, MSE loss) over a 128-sample `RegressionDataset` and return the model.

    The dataloader, model and optimizer are passed through `accelerator.prepare` before training.
    """
    dataset = RegressionDataset(length=128, seed=42)
    loader = DataLoader(dataset, batch_size=16, shuffle=False)
    sgd = torch.optim.SGD(model.parameters(), lr=0.1)

    loader, model, sgd = accelerator.prepare(loader, model, sgd)

    num_epochs = 3
    for _ in range(num_epochs):
        for batch in loader:
            model.zero_grad()
            prediction = model(batch["x"])
            accelerator.backward(torch.nn.functional.mse_loss(prediction, batch["y"]))
            sgd.step()
    return model


def check_weights(operation, state_1, state_2):
    """
    Compare two state dicts value by value, pairing entries in iteration order.

    With `operation == "same"` every pair has to be close (`torch.allclose`); with any other value every pair
    has to differ.
    """
    must_match = operation == "same"
    for left, right in zip(state_1.values(), state_2.values()):
        is_close = torch.allclose(left, right)
        assert is_close if must_match else not is_close


def check_safetensors_weights(path, model):
    """
    Load `path/model.safetensors` into a fresh `TinyModel` and check that it then matches `model`.

    Before loading, the fresh model is required to differ from `model`, so the final comparison is meaningful.
    """
    merged_state = load_file(path / "model.safetensors")
    fresh_model = TinyModel().to(torch_device)
    check_weights("diff", model.state_dict(), fresh_model.state_dict())
    fresh_model.load_state_dict(merged_state)
    check_weights("same", model.state_dict(), fresh_model.state_dict())


def check_pytorch_weights(path, model):
    """
    Load `path/pytorch_model.bin` into a fresh `TinyModel` and check that it then matches `model`.

    Before loading, the fresh model is required to differ from `model`, so the final comparison is meaningful.
    """
    merged_state = torch.load(path / "pytorch_model.bin", weights_only=True)
    fresh_model = TinyModel().to(torch_device)
    check_weights("diff", model.state_dict(), fresh_model.state_dict())
    fresh_model.load_state_dict(merged_state)
    check_weights("same", model.state_dict(), fresh_model.state_dict())


def test_merge_weights_safetensors(model, path):
    """Merge the sharded checkpoint with `merge_fsdp_weights` using safe serialization and verify the result."""
    sharded_checkpoint = path / "pytorch_model_fsdp_0"
    merge_fsdp_weights(sharded_checkpoint, path, safe_serialization=True)
    # The check reads the merged weights from `path/model.safetensors`.
    check_safetensors_weights(path, model)


def test_merge_weights_command_safetensors(model, path):
    """Run the merge command without extra flags and verify the safetensors output."""
    cli_args = [str(path / "pytorch_model_fsdp_0"), str(path)]
    merge_command(parser.parse_args(cli_args))
    check_safetensors_weights(path, model)


def test_merge_weights_pytorch(model, path):
    """Merge the sharded checkpoint with `merge_fsdp_weights` without safe serialization and verify the result."""
    sharded_checkpoint = path / "pytorch_model_fsdp_0"
    merge_fsdp_weights(sharded_checkpoint, path, safe_serialization=False)
    # The check reads the merged weights from `path/pytorch_model.bin`.
    check_pytorch_weights(path, model)


def test_merge_weights_command_pytorch(model, path):
    """Run the merge command with `--unsafe_serialization` and verify the `pytorch_model.bin` output."""
    cli_args = [str(path / "pytorch_model_fsdp_0"), str(path), "--unsafe_serialization"]
    merge_command(parser.parse_args(cli_args))
    check_pytorch_weights(path, model)


if __name__ == "__main__":
    # Note this test requires at least two accelerators!
    model, plugin, accelerator = setup()
    if accelerator.num_processes > 1:
        try:
            # Scratch directory that receives the sharded checkpoint and the merged files
            out_path = Path("test_merge_weights_fsdp_weights")
            if not out_path.exists():
                out_path.mkdir(parents=True, exist_ok=True)

            # Train briefly so that the weights no longer match a freshly initialised model
            model = mock_training(accelerator, model)
            accelerator.wait_for_everyone()

            gc.collect()  # Needed for some lingering refs after training
            save_fsdp_model(plugin, accelerator, model, out_path)
            accelerator.wait_for_everyone()

            # Finally we can test
            merge_checks = (
                test_merge_weights_safetensors,
                test_merge_weights_command_safetensors,
                test_merge_weights_pytorch,
                test_merge_weights_command_pytorch,
            )
            for merge_check in merge_checks:
                merge_check(model, out_path)
        finally:
            # Cleanup in case of any failures; errors from the block above still propagate
            if accelerator.is_main_process:
                shutil.rmtree(out_path)
            accelerator.wait_for_everyone()
            accelerator.end_training()


================================================
FILE: src/accelerate/test_utils/scripts/test_notebook.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test file to ensure that in general certain situational setups for notebooks work.
"""

import os
import time

from pytest import mark, raises
from torch.distributed.elastic.multiprocessing.errors import ChildFailedError

from accelerate import PartialState, notebook_launcher
from accelerate.test_utils import require_bnb
from accelerate.utils import is_bnb_available, is_xpu_available


def basic_function():
    """Print the current `PartialState`; used as the minimal function to launch."""
    state = PartialState()
    print(f"PartialState:\n{state}")


def tough_nut_function(queue):
    """
    Fail until the countdown stored in `queue` reaches zero.

    Returns immediately when the queue is empty. Otherwise takes the remaining trial count: while it is positive,
    puts back the count minus one and raises `RuntimeError`; once it is zero, prints the `PartialState`.
    """
    if queue.empty():
        return

    remaining_trials = queue.get()
    if remaining_trials <= 0:
        print(f"PartialState:\n{PartialState()}")
        return

    queue.put(remaining_trials - 1)
    raise RuntimeError("The nut hasn't cracked yet! Try again.")


def bipolar_sleep_function(sleep_sec: int):
    """Raise `RuntimeError` on processes with an even index; sleep for `sleep_sec` seconds on the others."""
    has_even_index = PartialState().process_index % 2 == 0
    if has_even_index:
        raise RuntimeError("I'm an even process. I don't like to sleep.")
    time.sleep(sleep_sec)


# Number of processes to launch, read from the `ACCELERATE_NUM_PROCESSES` environment variable (1 when unset).
NUM_PROCESSES = int(os.environ.get("ACCELERATE_NUM_PROCESSES", 1))


def test_can_initialize():
    """Launch `basic_function` through `notebook_launcher` on `NUM_PROCESSES` processes."""
    notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES)


@mark.skipif(NUM_PROCESSES < 2, reason="Need at least 2 processes to test static rendezvous backends")
def test_static_rdzv_backend():
    """Launch `basic_function` through `notebook_launcher` with `rdzv_backend="static"`."""
    notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES, rdzv_backend="static")


@mark.skipif(NUM_PROCESSES < 2, reason="Need at least 2 processes to test c10d rendezvous backends")
def test_c10d_rdzv_backend():
    """Launch `basic_function` through `notebook_launcher` with `rdzv_backend="c10d"`."""
    notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES, rdzv_backend="c10d")


@mark.skipif(NUM_PROCESSES < 2, reason="Need at least 2 processes to test fault tolerance")
def test_fault_tolerant(max_restarts: int = 3):
    """
    Launch `tough_nut_function` with a countdown of `max_restarts` in a shared queue and the same number of
    allowed restarts passed to `notebook_launcher`.
    """
    # Use torch.multiprocessing to get the right context for the current device
    import torch.multiprocessing as mp

    # 'spawn' for XPU, 'fork' for everything else
    start_method = "spawn" if is_xpu_available() else "fork"
    countdown = mp.get_context(start_method).Queue()
    countdown.put(max_restarts)
    notebook_launcher(tough_nut_function, (countdown,), num_processes=NUM_PROCESSES, max_restarts=max_restarts)


@mark.skipif(NUM_PROCESSES < 2, reason="Need at least 2 processes to test monitoring")
def test_monitoring(monitor_interval: float = 0.01, sleep_sec: int = 100):
    """
    Launch `bipolar_sleep_function` and check that the failure of the even processes surfaces as a
    `ChildFailedError` in less than `sleep_sec` seconds, i.e. before the sleeping processes would finish.
    """
    began_at = time.time()
    expected_failure = raises(ChildFailedError, match="I'm an even process. I don't like to sleep.")
    with expected_failure:
        notebook_launcher(
            bipolar_sleep_function,
            (sleep_sec,),
            num_processes=NUM_PROCESSES,
            monitor_interval=monitor_interval,
        )
    elapsed = time.time() - began_at
    assert elapsed < sleep_sec, "Monitoring did not stop the process in time."


@require_bnb
def test_problematic_imports():
    """
    Check that a `RuntimeError` matching "Please keep these imports" is raised when `bitsandbytes` is imported
    before `notebook_launcher` is called.
    """
    with raises(RuntimeError, match="Please keep these imports"):
        # The import is deliberately done here, right before launching, inside the `raises` block.
        import bitsandbytes as bnb  # noqa: F401

        # NOTE(review): presumably this call is what raises — confirm against `notebook_launcher`.
        notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES)


def main():
    """Run the notebook-launcher checks in order, printing a banner before each one."""
    # NOTE(review): the `mark.skipif` decorators on the functions below are pytest markers; presumably they
    # have no effect when the functions are called directly like this — confirm.
    print("Test basic notebook can be ran")
    test_can_initialize()
    print("Test static rendezvous backend")
    test_static_rdzv_backend()
    print("Test c10d rendezvous backend")
    test_c10d_rdzv_backend()
    print("Test fault tolerant")
    test_fault_tolerant()
    print("Test monitoring")
    test_monitoring()
    # The bitsandbytes check only runs when the package is available.
    if is_bnb_available():
        print("Test problematic imports (bnb)")
        test_problematic_imports()
    # With more than one process, tear the process group down at the end.
    if NUM_PROCESSES > 1:
        PartialState().destroy_process_group()


# Run all checks when the file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/test_utils/scripts/test_ops.py
================================================
#!/usr/bin/env python

# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from accelerate import PartialState
from accelerate.test_utils.testing import assert_exception
from accelerate.utils.dataclasses import DistributedType
from accelerate.utils.operations import (
    DistributedOperationException,
    broadcast,
    copy_tensor_to_devices,
    gather,
    gather_object,
    pad_across_processes,
    reduce,
)


def create_tensor(state):
    """
    Build a 1-D float tensor of `state.num_processes` consecutive values on `state.device`.

    Process `i` gets the values `i * num_processes + 1` up to `(i + 1) * num_processes`.
    """
    world_size = state.num_processes
    one_based = torch.arange(world_size) + 1.0
    shifted = one_based + (world_size * state.process_index)
    return shifted.to(state.device)


def test_gather(state):
    """Gather every process's tensor and check the result is 1, 2, ..., `num_processes ** 2`."""
    expected_values = list(range(1, state.num_processes**2 + 1))
    gathered = gather(create_tensor(state))
    assert gathered.tolist() == expected_values


def test_gather_object(state):
    """Gather a one-element list holding the process index and check the result is `[0, 1, ...]`."""
    # Gather objects in TorchXLA is not supported.
    if state.distributed_type == DistributedType.XLA:
        return

    gathered_obj = gather_object([state.process_index])
    expected_length = state.num_processes
    assert len(gathered_obj) == expected_length, f"{gathered_obj}, {len(gathered_obj)} != {state.num_processes}"
    expected_obj = list(range(state.num_processes))
    assert gathered_obj == expected_obj, f"{gathered_obj} != {list(range(state.num_processes))}"


def test_gather_non_contiguous(state):
    """Check that `gather` accepts a non-contiguous tensor without raising."""
    # Skip this test because the 'is_contiguous' function of XLA tensor always returns True.
    if state.distributed_type == DistributedType.XLA:
        return

    # Transposing after allocation on the device yields a non-contiguous view
    transposed = torch.arange(12, device=state.device).view(4, 3).t()
    assert not transposed.is_contiguous()
    # Only the absence of an error matters; the result is discarded
    gather(transposed)


def test_broadcast(state):
    """Broadcast each process's tensor and check every process ends up with 1, ..., `num_processes`."""
    world_size = state.num_processes
    received = broadcast(create_tensor(state))
    assert received.shape == torch.Size([world_size])
    assert received.tolist() == list(range(1, world_size + 1))


def test_pad_across_processes(state):
    """
    Give the main process a tensor one element longer than the others and check that padding brings every
    tensor to that length, with a trailing zero added on the non-main processes.
    """
    # The main process gets one extra element so that there is something to pad on the other processes
    extra = 1 if state.is_main_process else 0
    local_tensor = torch.arange(state.num_processes + extra).to(state.device)

    padded = pad_across_processes(local_tensor)
    assert padded.shape == torch.Size([state.num_processes + 1])
    if state.is_main_process:
        return
    assert padded.tolist() == list(range(0, state.num_processes)) + [0]


def test_reduce_sum(state):
    """Check that a "sum" reduction of the two per-process tensors gives `[4, 6]`."""
    # For now runs on only two processes
    if state.num_processes == 2:
        reduced_tensor = reduce(create_tensor(state), "sum")
        truth_tensor = torch.tensor([4.0, 6]).to(state.device)
        assert torch.allclose(reduced_tensor, truth_tensor), f"{reduced_tensor} != {truth_tensor}"


def test_reduce_mean(state):
    """Check that a "mean" reduction of the two per-process tensors gives `[2, 3]`."""
    # For now runs on only two processes
    if state.num_processes == 2:
        reduced_tensor = reduce(create_tensor(state), "mean")
        truth_tensor = torch.tensor([2.0, 3]).to(state.device)
        assert torch.allclose(reduced_tensor, truth_tensor), f"{reduced_tensor} != {truth_tensor}"


def test_op_checker(state):
    """
    Check that `pad_across_processes`, `reduce` and `broadcast` raise `DistributedOperationException` when the
    processes pass tensors of different shapes while `state.debug` is set.
    """
    # Must be in a distributed state, and gathering is currently not supported in TorchXLA.
    if state.distributed_type in [DistributedType.NO, DistributedType.XLA]:
        return
    # NOTE(review): presumably this flag is what enables the shape checks in the operations — confirm.
    state.debug = True
    # `pad_across_processes`
    # Process 0 builds a 2-D tensor, every other process a 3-D one, so the shapes disagree.
    if state.process_index == 0:
        data = {"tensor": torch.tensor([[0.0, 1, 2, 3, 4]]).to(state.device)}
    else:
        data = {"tensor": torch.tensor([[[0.0, 1, 2, 3, 4, 5]]]).to(state.device)}

    with assert_exception(DistributedOperationException):
        pad_across_processes(data, dim=0)

    # `reduce`
    if state.process_index == 0:
        data = {"tensor": torch.tensor([[0.0, 1, 2, 3, 4]]).to(state.device)}
    else:
        data = {"tensor": torch.tensor([[[0.0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]]).to(state.device)}

    with assert_exception(DistributedOperationException):
        reduce(data)

    # `broadcast`
    if state.process_index == 0:
        data = {"tensor": torch.tensor([[0.0, 1, 2, 3, 4]]).to(state.device)}
    else:
        data = {"tensor": torch.tensor([[[0.0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]]).to(state.device)}

    with assert_exception(DistributedOperationException):
        broadcast(data)

    # Switch the flag back off for whatever runs afterwards.
    state.debug = False


def test_copy_tensor_to_devices(state):
    """
    Check that `copy_tensor_to_devices` hands the main process's tensor to every process.

    Only runs for the `MULTI_GPU` and `XLA` distributed types.
    """
    supported_types = [DistributedType.MULTI_GPU, DistributedType.XLA]
    if state.distributed_type not in supported_types:
        return

    # Only the main process owns the data; the others start from `None`
    source = torch.tensor([1, 2, 3], dtype=torch.int).to(state.device) if state.is_main_process else None
    received = copy_tensor_to_devices(source)
    assert torch.allclose(received, torch.tensor([1, 2, 3], dtype=torch.int, device=state.device))


def _mp_fn(index):
    """Entry point taking a process `index` (unused here) that simply runs `main`."""
    # For xla_spawn (TPUs)
    main()


def main():
    """Run every operation check in order on a `PartialState`, then destroy the process group."""
    state = PartialState()
    state.print(f"State: {state}")

    # Banner printed before each check, paired with the check itself; the order is kept as listed.
    checks = (
        ("testing gather", test_gather),
        ("testing gather_object", test_gather_object),
        ("testing gather non-contiguous", test_gather_non_contiguous),
        ("testing broadcast", test_broadcast),
        ("testing pad_across_processes", test_pad_across_processes),
        ("testing reduce_sum", test_reduce_sum),
        ("testing reduce_mean", test_reduce_mean),
        ("testing op_checker", test_op_checker),
        ("testing sending tensors across devices", test_copy_tensor_to_devices),
    )
    for banner, check in checks:
        state.print(banner)
        check(state)

    state.destroy_process_group()


# Run all checks when the file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/test_utils/scripts/test_script.py
================================================
#!/usr/bin/env python

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import contextlib
import io
import math
import time
from copy import deepcopy
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset

from accelerate import Accelerator
from accelerate.data_loader import SeedableRandomSampler, prepare_data_loader
from accelerate.state import AcceleratorState
from accelerate.test_utils import RegressionDataset, RegressionModel, are_the_same_tensors
from accelerate.utils import (
    DataLoaderConfiguration,
    DistributedType,
    gather,
    gather_object,
    is_bf16_available,
    is_cuda_available,
    is_datasets_available,
    is_fp16_available,
    is_hpu_available,
    is_mps_available,
    is_pytest_available,
    set_seed,
    synchronize_rng_states,
)


# Absolute and relative tolerances for this script: 1e-3 when an HPU is available, 1e-6 otherwise.
if is_hpu_available():
    ATOL = 1e-3
    RTOL = 1e-3
else:
    ATOL = 1e-6
    RTOL = 1e-6


def generate_baseline_dataloader(train_set, generator, batch_size, use_seedable_sampler=False):
    """
    Build the reference `DataLoader` over `train_set`.

    When `use_seedable_sampler` is true the loader draws its indices from a `SeedableRandomSampler` driven by
    `generator`; otherwise it is a regular shuffling `DataLoader` that uses `generator` directly.
    """
    if not use_seedable_sampler:
        return DataLoader(train_set, batch_size=batch_size, shuffle=True, generator=generator)

    # The SeedableRandomSampler is needed during distributed setups
    # for full reproducibility across processes with the `DataLoader`
    seedable_sampler = SeedableRandomSampler(
        data_source=train_set,
        num_samples=len(train_set),
        generator=generator,
    )
    return DataLoader(train_set, batch_size=batch_size, sampler=seedable_sampler)


def print_main(state):
    """Print the "main process" marker line together with `state.process_index`."""
    index = state.process_index
    print(f"Printing from the main process {index}")


def print_local_main(state):
    """Print the "local main process" marker line together with `state.local_process_index`."""
    local_index = state.local_process_index
    print(f"Printing from the local main process {local_index}")


def print_last(state):
    """Print the "last process" marker line together with `state.process_index`."""
    index = state.process_index
    print(f"Printing from the last process {index}")


def print_on(state, process_idx):
    """Print the requested `process_idx` followed by the index reported by `state`."""
    actual_index = state.process_index
    print(f"Printing from process {process_idx}: {actual_index}")


def process_execution_check():
    """
    Exercise the process-ordering helpers of `Accelerator`.

    First checks `main_process_first`: every process appends a line to a shared text file and the main process
    verifies that its own line was written first. Then checks that the `on_main_process`,
    `on_local_main_process`, `on_last_process` and `on_process` decorators only let the targeted process print.
    """
    accelerator = Accelerator()
    num_processes = accelerator.num_processes

    # Test main_process_first context manager
    path = Path("check_main_process_first.txt")
    with accelerator.main_process_first():
        if accelerator.is_main_process:
            time.sleep(0.1)  # ensure main process takes longest
            line = "Currently in the main process\n"
        else:
            line = "Now on another process\n"
        with open(path, "a+") as handle:
            handle.write(line)
    accelerator.wait_for_everyone()

    if accelerator.is_main_process:
        with open(path) as handle:
            text = handle.read()
        try:
            assert text.startswith("Currently in the main process\n"), "Main process was not first"
            if num_processes > 1:
                assert text.endswith("Now on another process\n"), "Main process was not first"
            assert text.count("Now on another process\n") == accelerator.num_processes - 1, (
                f"Only wrote to file {text.count('Now on another process') + 1} times, not {accelerator.num_processes}"
            )
        except AssertionError:
            # Remove the scratch file before propagating the failure.
            path.unlink()
            raise

    if accelerator.is_main_process and path.exists():
        path.unlink()
    accelerator.wait_for_everyone()

    # Test the decorators
    def captured_stdout(call):
        """Run `call` with stdout redirected and return what it printed, right-stripped."""
        stream = io.StringIO()
        with contextlib.redirect_stdout(stream):
            call()
        return stream.getvalue().rstrip()

    result = captured_stdout(lambda: accelerator.on_main_process(print_main)(accelerator.state))
    if accelerator.is_main_process:
        assert result == "Printing from the main process 0", f"{result} != Printing from the main process 0"
    else:
        assert result == "", f'{result} != ""'

    printed = captured_stdout(lambda: accelerator.on_local_main_process(print_local_main)(accelerator.state))
    if accelerator.is_local_main_process:
        assert printed == "Printing from the local main process 0"
    else:
        assert printed == ""

    printed = captured_stdout(lambda: accelerator.on_last_process(print_last)(accelerator.state))
    if accelerator.is_last_process:
        assert printed == f"Printing from the last process {accelerator.state.num_processes - 1}"
    else:
        assert printed == ""

    for process_idx in range(num_processes):
        printed = captured_stdout(
            lambda idx=process_idx: accelerator.on_process(print_on, process_index=idx)(accelerator.state, idx)
        )
        if accelerator.process_index == process_idx:
            assert printed == f"Printing from process {process_idx}: {accelerator.process_index}"
        else:
            assert printed == ""


def init_state_check():
    """Instantiate `AcceleratorState`, print a greeting on each local main process and the state on every process."""
    # Test we can instantiate this twice in a row.
    state = AcceleratorState()
    on_local_main = state.local_process_index == 0
    if on_local_main:
        print("Testing, testing. 1, 2, 3.")
    print(state)


def rng_sync_check():
    """
    Synchronize the RNG states and assert that every process ends up with the same state.

    Checks the CPU torch RNG, the CUDA or XPU RNG (for `MULTI_GPU` / `MULTI_XPU` runs respectively) and a
    standalone `torch.Generator`.
    """
    state = AcceleratorState()

    synchronize_rng_states(["torch"])
    cpu_rng_state = torch.get_rng_state()
    assert are_the_same_tensors(cpu_rng_state), "RNG states improperly synchronized on CPU."

    distributed_type = state.distributed_type
    if distributed_type == DistributedType.MULTI_GPU:
        synchronize_rng_states(["cuda"])
        cuda_rng_state = torch.cuda.get_rng_state()
        assert are_the_same_tensors(cuda_rng_state), "RNG states improperly synchronized on GPU."
    elif distributed_type == DistributedType.MULTI_XPU:
        synchronize_rng_states(["xpu"])
        xpu_rng_state = torch.xpu.get_rng_state()
        assert are_the_same_tensors(xpu_rng_state), "RNG states improperly synchronized on XPU."

    generator = torch.Generator()
    synchronize_rng_states(["generator"], generator=generator)
    generator_state = generator.get_state()
    assert are_the_same_tensors(generator_state), "RNG states improperly synchronized in generator."

    if state.local_process_index == 0:
        print("All rng are properly synched.")


def dl_preparation_check():
    """
    Check that dataloaders sharded by `prepare_data_loader` still yield every sample once gathered.

    Covers non-shuffled and shuffled loaders, each with and without `split_batches=True`.
    """
    state = AcceleratorState()
    length = 32 * state.num_processes
    # Extra keyword arguments for the two sharding modes under test.
    sharding_variants = ({}, {"split_batches": True})

    for extra_kwargs in sharding_variants:
        dl = prepare_data_loader(
            DataLoader(range(length), batch_size=8),
            state.device,
            state.num_processes,
            state.process_index,
            put_on_device=True,
            **extra_kwargs,
        )
        gathered = torch.cat([gather(batch) for batch in dl])
        # Without shuffling, the gathered samples must come back in their original order.
        assert torch.equal(gathered.cpu(), torch.arange(0, length).long()), "Wrong non-shuffled dataloader result."

    if state.process_index == 0:
        print("Non-shuffled dataloader passing.")

    for extra_kwargs in sharding_variants:
        dl = prepare_data_loader(
            DataLoader(range(length), batch_size=8, shuffle=True),
            state.device,
            state.num_processes,
            state.process_index,
            put_on_device=True,
            **extra_kwargs,
        )
        # With shuffling only the set of samples is checked, so sort before comparing.
        seen = sorted(torch.cat([gather(batch) for batch in dl]).tolist())
        assert seen == list(range(length)), "Wrong shuffled dataloader result."

    if state.local_process_index == 0:
        print("Shuffled dataloader passing.")


def central_dl_preparation_check():
    """
    Same checks as the regular dataloader preparation test, but with `dispatch_batches=True`.

    Covers non-shuffled and shuffled loaders, each with and without `split_batches=True`, and asserts that the
    gathered batches contain every sample.
    """
    state = AcceleratorState()
    length = 32 * state.num_processes
    # Keyword arguments for the two dispatching modes under test.
    dispatch_variants = (
        {"dispatch_batches": True},
        {"split_batches": True, "dispatch_batches": True},
    )

    for extra_kwargs in dispatch_variants:
        dl = prepare_data_loader(
            DataLoader(range(length), batch_size=8),
            state.device,
            state.num_processes,
            state.process_index,
            put_on_device=True,
            **extra_kwargs,
        )
        gathered = torch.cat([gather(batch) for batch in dl])
        # Without shuffling, the gathered samples must come back in their original order.
        assert torch.equal(gathered.cpu(), torch.arange(0, length).long()), "Wrong non-shuffled dataloader result."

    if state.process_index == 0:
        print("Non-shuffled central dataloader passing.")

    for extra_kwargs in dispatch_variants:
        dl = prepare_data_loader(
            DataLoader(range(length), batch_size=8, shuffle=True),
            state.device,
            state.num_processes,
            state.process_index,
            put_on_device=True,
            **extra_kwargs,
        )
        # With shuffling only the set of samples is checked, so sort before comparing.
        seen = sorted(torch.cat([gather(batch) for batch in dl]).tolist())
        assert seen == list(range(length)), "Wrong shuffled dataloader result."

    if state.local_process_index == 0:
        print("Shuffled central dataloader passing.")


def custom_sampler_check():
    """Check that `prepare_data_loader` keeps a user-provided batch sampler instead of replacing it."""
    state = AcceleratorState()

    class CustomDataset(Dataset):
        """Minimal map-style dataset backed by an indexable `data` object."""

        def __init__(self, data):
            self.data = data

        def __len__(self):
            return len(self.data)

        def __getitem__(self, index):
            return self.data[index]

    class CustomBatchSampler:
        """Batch sampler that splits the (optionally permuted) index range into `len(self)` chunks."""

        def __init__(self, dataset_length: int, batch_size: int, shuffle: bool = True):
            self.batch_size = batch_size
            self.data_index = np.arange(dataset_length)
            self.shuffle = shuffle

        def __iter__(self):
            num_batches = len(self)
            order = np.random.permutation(self.data_index) if self.shuffle else self.data_index
            yield from np.array_split(order, num_batches)

        def __len__(self):
            return math.ceil(len(self.data_index) / self.batch_size)

    dataset = CustomDataset(range(32 * state.num_processes))
    sampler = CustomBatchSampler(len(dataset), batch_size=8)
    dl = prepare_data_loader(
        DataLoader(dataset, batch_sampler=sampler), state.device, state.num_processes, state.process_index
    )
    # We just need to ensure that `dl.batch_sampler` (or `dl.batch_sampler.batch_sampler`, when the prepared
    # sampler wraps another one) is indeed the old batch sampler
    innermost_sampler = getattr(dl.batch_sampler, "batch_sampler", dl.batch_sampler)
    assert isinstance(innermost_sampler, CustomBatchSampler), (
        "Custom sampler was changed after calling `prepare_data_loader`"
    )


def check_seedable_sampler():
    """Check that re-seeding and resetting the epoch replays the same items with a seedable sampler."""

    def three_epochs(loader):
        """Concatenate the `x` entries of every batch over three passes through `loader`."""
        return torch.cat([batch["x"] for _ in range(3) for batch in loader])

    # Set seed
    set_seed(42)
    train_set = RegressionDataset(length=10, seed=42)
    train_dl = DataLoader(train_set, batch_size=2, shuffle=True)

    config = DataLoaderConfiguration(use_seedable_sampler=True)
    accelerator = Accelerator(dataloader_config=config)
    train_dl = accelerator.prepare(train_dl)
    original_items = three_epochs(train_dl)

    # Set seed again and the epoch
    set_seed(42)
    train_dl.set_epoch(0)
    new_items = three_epochs(train_dl)

    assert torch.allclose(original_items, new_items), "Did not obtain the same items with the same seed and epoch."


def check_seedable_sampler_in_batch_sampler_shard():
    """
    Check that the sampler nested inside the prepared batch sampler is a `SeedableRandomSampler`.

    Requires more than one process.
    """
    set_seed(42)

    accelerator = Accelerator(dataloader_config=DataLoaderConfiguration(use_seedable_sampler=True))
    assert accelerator.num_processes > 1, "This test requires more than one process."

    prepared_data_loader = prepare_data_loader(
        dataloader=DataLoader(list(range(10)), batch_size=1, shuffle=True),
        use_seedable_sampler=True,
    )

    # Walk through the wrapping batch sampler down to the underlying index sampler.
    wrapped_batch_sampler = prepared_data_loader.batch_sampler.batch_sampler
    target_sampler = wrapped_batch_sampler.sampler
    assert isinstance(target_sampler, SeedableRandomSampler), (
        "Sampler in BatchSamplerShard is not SeedableRandomSampler."
    )


def check_seedable_sampler_with_data_seed():
    """Check that changing `data_seed` in the dataloader configuration changes the items that are produced."""

    def three_epochs(loader):
        """Concatenate the `x` entries of every batch over three passes through `loader`."""
        return torch.cat([batch["x"] for _ in range(3) for batch in loader])

    # Set seed
    set_seed(42)
    data_seed = 42
    train_set = RegressionDataset(length=10, seed=42)
    train_dl = DataLoader(train_set, batch_size=2, shuffle=True)

    config = DataLoaderConfiguration(use_seedable_sampler=True, data_seed=data_seed)
    accelerator = Accelerator(dataloader_config=config)
    original_items = three_epochs(accelerator.prepare(train_dl))

    # Set new data seed
    config.data_seed = 43
    accelerator = Accelerator(dataloader_config=config)
    new_items = three_epochs(accelerator.prepare(train_dl))

    assert not torch.allclose(original_items, new_items), "Obtained the same items with different data seed."


def mock_training(length, batch_size, generator, use_seedable_sampler=False):
    """
    Train a `RegressionModel` for three epochs without going through `Accelerator`.

    Seeds are fixed to 42 before the dataset is built. Returns the `(train_set, model)` pair.
    """
    set_seed(42)
    generator.manual_seed(42)
    train_set = RegressionDataset(length=length, seed=42)
    baseline_loader = generate_baseline_dataloader(train_set, generator, batch_size, use_seedable_sampler)

    model = RegressionModel()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    mse = torch.nn.functional.mse_loss
    for _ in range(3):
        for batch in baseline_loader:
            model.zero_grad()
            prediction = model(batch["x"])
            mse(prediction, batch["y"]).backward()
            optimizer.step()
    return train_set, model


def _train_with_accelerator(accelerator, train_set, generator, batch_size, use_seedable_sampler):
    """Train a fresh `RegressionModel` for three epochs through `accelerator`; return it unwrapped on CPU."""
    train_dl = generate_baseline_dataloader(train_set, generator, batch_size, use_seedable_sampler)
    model = RegressionModel()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
    # Re-seed after `prepare` so every run starts from the same RNG state.
    set_seed(42)
    generator.manual_seed(42)
    for _ in range(3):
        for batch in train_dl:
            model.zero_grad()
            output = model(batch["x"])
            loss = torch.nn.functional.mse_loss(output, batch["y"])
            accelerator.backward(loss)
            optimizer.step()

    return accelerator.unwrap_model(model).cpu()


def _assert_same_as_baseline(old_model, model):
    """Assert that parameters `a` and `b` of `model` match those of `old_model` within `ATOL`/`RTOL`."""
    for name in ("a", "b"):
        torch.testing.assert_close(
            getattr(old_model, name),
            getattr(model, name),
            atol=ATOL,
            rtol=RTOL,
            msg=lambda msg: f"Did not obtain the same model on CPU or distributed training.\n{msg}",
        )


def training_check(use_seedable_sampler=False):
    """
    Check that training through `Accelerator` reproduces the baseline obtained by `mock_training`.

    The comparison is done without batch splitting, with `split_batches=True`, and then with bf16 and fp16 mixed
    precision when those are available (fp16 is skipped on HPU). When CUDA or MPS is available, it also runs a
    forward pass through a model unwrapped with `keep_fp32_wrapper=True`.

    Args:
        use_seedable_sampler (`bool`, defaults to `False`):
            Whether the baseline dataloaders are built with a `SeedableRandomSampler`.
    """
    state = AcceleratorState()
    generator = torch.Generator()
    batch_size = 8
    length = batch_size * 4 * state.num_processes

    # The baseline uses the global batch size since it runs without any sharding.
    train_set, old_model = mock_training(length, batch_size * state.num_processes, generator, use_seedable_sampler)
    assert are_the_same_tensors(old_model.a), "Did not obtain the same model on both processes."
    assert are_the_same_tensors(old_model.b), "Did not obtain the same model on both processes."

    accelerator = Accelerator()
    model = _train_with_accelerator(accelerator, train_set, generator, batch_size, use_seedable_sampler)
    _assert_same_as_baseline(old_model, model)

    accelerator.print("Training yielded the same results on one CPU or distributed setup with no batch split.")

    dataloader_config = DataLoaderConfiguration(split_batches=True, use_seedable_sampler=use_seedable_sampler)
    accelerator = Accelerator(dataloader_config=dataloader_config)
    # With `split_batches=True` the dataloader is given the global batch size.
    model = _train_with_accelerator(
        accelerator, train_set, generator, batch_size * state.num_processes, use_seedable_sampler
    )
    _assert_same_as_baseline(old_model, model)

    accelerator.print("Training yielded the same results on one CPU or distributed setup with batch split.")

    # FP32 wrapper check
    if is_cuda_available() or is_mps_available():
        # Mostly a test that model.forward will have autocast when running unwrap_model(model, keep_fp32_wrapper=True)
        print("Keep fp32 wrapper check.")
        AcceleratorState._reset_state()
        accelerator = Accelerator(mixed_precision="fp16")

        model = torch.nn.Linear(2, 4)
        model = accelerator.prepare(model)
        model_with_fp32_wrapper = accelerator.unwrap_model(model, keep_fp32_wrapper=True)

        # Run forward with fp16 as input.
        # When the model is with mixed precision wrapper, no error will be raised.
        input_tensor = torch.Tensor([1, 2]).to(dtype=torch.float16, device=accelerator.device)
        model_with_fp32_wrapper(input_tensor)

    # BF16 support
    if is_bf16_available():
        # Mostly a test that BF16 doesn't crash as the operation inside the model is not converted to BF16
        print("BF16 training check.")
        AcceleratorState._reset_state()
        dataloader_config = DataLoaderConfiguration(use_seedable_sampler=use_seedable_sampler)
        accelerator = Accelerator(mixed_precision="bf16", dataloader_config=dataloader_config)
        model = _train_with_accelerator(accelerator, train_set, generator, batch_size, use_seedable_sampler)
        _assert_same_as_baseline(old_model, model)

    # FP16 support (HPU fp16 model seems to be off by 10% from the CPU, which is a lot of numerical error)
    if is_fp16_available() and not is_hpu_available():
        # Mostly a test that FP16 doesn't crash as the operation inside the model is not converted to FP16
        print("FP16 training check.")
        AcceleratorState._reset_state()
        dataloader_config = DataLoaderConfiguration(use_seedable_sampler=use_seedable_sampler)
        accelerator = Accelerator(mixed_precision="fp16", dataloader_config=dataloader_config)
        model = _train_with_accelerator(accelerator, train_set, generator, batch_size, use_seedable_sampler)
        _assert_same_as_baseline(old_model, model)


def test_split_between_processes_dataset(datasets_Dataset):
    """
    Check `split_between_processes` on `datasets.Dataset` inputs, with and without padding.

    Args:
        datasets_Dataset: The `datasets.Dataset` class, passed in by the caller so the import stays optional.
    """
    state = AcceleratorState()

    def build_dataset(num_rows):
        """Create a dataset with a single column `k` holding `0..num_rows - 1`."""
        return datasets_Dataset.from_list([dict(k=v) for v in range(num_rows)])

    # Evenly divisible input: two rows for every process.
    data = build_dataset(2 * state.num_processes)
    with state.split_between_processes(data, apply_padding=False) as results:
        assert len(results) == 2, (
            f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
        )

    # One row short: without padding the last process only receives a single row.
    data = build_dataset(2 * state.num_processes - 1)
    with state.split_between_processes(data, apply_padding=False) as results:
        if not state.is_last_process:
            assert len(results) == 2, (
                f"One of the intermediate processes did not receive two items. Process index: {state.process_index}; Length: {len(results)}"
            )
        else:
            assert len(results) == 1, (
                f"Last process did not receive a single item. Process index: {state.process_index}; Length: {len(results)}"
            )
    state.wait_for_everyone()

    odd_data = build_dataset(2 * state.num_processes - 1)
    even_data = build_dataset(2 * state.num_processes)

    for data in (odd_data, even_data):
        expected_output = data["k"]

        with state.split_between_processes(data, apply_padding=True) as results:
            if state.num_processes != 1:
                assert len(results) == 2, (
                    f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
                )
            else:
                assert len(results) == len(data), (
                    f"Single process did not receive all items. Process index: {state.process_index}; Length: {len(results)}"
                )

            results_per_process = list(results)

        state.wait_for_everyone()

        gathered_results = gather_object(results_per_process)
        # Padding entries sit past the original length, so drop them before comparing.
        output = [row["k"] for row in gathered_results[: len(data)]]

        assert expected_output == output, f"Gathered results is incorrect. Expected: {expected_output}; Got: {output}"


def test_split_between_processes_list():
    """Check `split_between_processes` on plain lists, with and without padding."""
    state = AcceleratorState()

    data = list(range(0, 2 * state.num_processes))
    with state.split_between_processes(data) as results:
        assert len(results) == 2, (
            f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
        )
    state.wait_for_everyone()

    even_data = list(range(0, (2 * state.num_processes)))
    odd_data = list(range(0, (2 * state.num_processes) - 1))
    for data in (odd_data, even_data):
        expected_output = data

        with state.split_between_processes(data, apply_padding=True) as results:
            num_samples_per_device = math.ceil(len(data) / state.num_processes)
            # Test all processes gets the correct number of item(s)
            assert len(results) == num_samples_per_device, (
                f"Process {state.device} did not get the correct number of item(s). Process index: {state.process_index}; Length: {len(results)}"
            )

            results_per_process = list(results)

        state.wait_for_everyone()

        # Padding entries sit past the original length, so drop them before comparing.
        output = gather_object(results_per_process)[: len(data)]

        assert expected_output == output, f"Gathered results is incorrect. Expected: {expected_output}; Got: {output}"


def test_split_between_processes_nested_dict():
    """
    Check `split_between_processes` on a dict holding two lists and a tensor of eight items each.

    Only runs the comparison for 1, 2 or 4 processes. The first process, the second of two processes and the
    fourth of four processes are checked against the expected slice of the original data; the remaining
    processes (indices 1 and 2 when running on four processes) are not checked.
    """
    state = AcceleratorState()
    a = [1, 2, 3, 4, 5, 6, 7, 8]
    b = ["a", "b", "c", "d", "e", "f", "g", "h"]
    c = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
    if state.num_processes in (1, 2, 4):
        data = {"a": a, "b": b, "c": c}
        # Compare against a copy so the expectation cannot be affected by the split itself.
        data_copy = deepcopy(data)
        with state.split_between_processes(data) as results:
            if state.process_index == 0:
                expected_slice = slice(None, 8 // state.num_processes)
            elif state.num_processes == 2:
                expected_slice = slice(4, None)
            elif state.process_index == 3:
                expected_slice = slice(-2, None)
            else:
                expected_slice = None

            if expected_slice is not None:
                # We return a list each time
                for key in ("a", "b"):
                    expected = data_copy[key][expected_slice]
                    assert results[key] == expected, (
                        f"Did not obtain expected values for `{key}` on process {state.process_index}. "
                        f"Expected: {expected}, Actual: {results[key]}"
                    )
                expected = data_copy["c"][expected_slice]
                assert torch.allclose(results["c"], expected), (
                    f"Did not obtain expected values on process {state.process_index}, expected `{expected}`, received: {results['c']}"
                )

    state.wait_for_everyone()


def test_split_between_processes_tensor():
    """Check `split_between_processes` on tensors, with and without padding."""
    state = AcceleratorState()
    device = state.device

    if state.num_processes > 1:
        data = torch.tensor([[0, 1, 2, 3], [4, 5, 6, 7]]).to(device)
        with state.split_between_processes(data) as results:
            # The first process is expected to hold the first row, every other process the second one.
            expected_row = [0, 1, 2, 3] if state.process_index == 0 else [4, 5, 6, 7]
            expected = torch.tensor([expected_row]).to(device)
            torch.testing.assert_close(results, expected)
        state.wait_for_everyone()

    even_data = torch.tensor([[i] for i in range(2 * state.num_processes)]).to(device)
    odd_data = torch.tensor([[i] for i in range(2 * state.num_processes - 1)]).to(device)
    for data in (even_data, odd_data):
        expected_output = [torch.tensor(row) for row in data.tolist()]

        with state.split_between_processes(data, apply_padding=True) as results:
            num_samples_per_device = math.ceil(len(data) / state.num_processes)
            assert len(results) == num_samples_per_device, (
                f"Process {state.device} did not get the correct number of item(s). Process index: {state.process_index}; Length: {len(results)}"
            )
            # Move the rows to CPU before they are gathered as Python objects.
            results_per_process = [row.to("cpu") for row in results]

        state.wait_for_everyone()

        # Padding entries sit past the original length, so drop them before comparing.
        output = gather_object(results_per_process)[: len(data)]

        assert expected_output == output, f"Gathered results is incorrect. Expected: {expected_output}; Got: {output}"


def test_split_between_processes_evenly():
    """
    Check how 17 items are spread by `split_between_processes` on 1, 2, 4 or 8 processes.

    The first `17 % num_processes` processes are expected to hold one more item than the others.
    """
    state = AcceleratorState()
    if state.num_processes in (1, 2, 4, 8):
        data = list(range(17))
        num_samples_per_process, num_extras = divmod(len(data), state.num_processes)
        with state.split_between_processes(data) as results:
            expected_length = num_samples_per_process
            if state.process_index < num_extras:
                expected_length += 1
            assert len(results) == expected_length, (
                f"Each Process should have even elements. Expected: {expected_length}, Actual: {len(results)}"
            )
    state.wait_for_everyone()


def test_trigger():
    """Check that a trigger set on the main process is seen once by every process and then cleared."""
    accelerator = Accelerator()

    # should start with being false
    initial = accelerator.check_trigger()
    assert initial is False

    # set a breakpoint on the main process
    if accelerator.is_main_process:
        accelerator.set_trigger()

    # check it's been activated across all processes
    # calls `all_reduce` and triggers a sync
    after_set = accelerator.check_trigger()
    assert after_set is True

    # check it's been reset after the sync
    after_sync = accelerator.check_trigger()
    assert after_sync is False


def test_reinstantiated_state():
    """Check that using an `Accelerator` after its state was reset raises the dedicated `AttributeError`."""
    import pytest

    AcceleratorState._reset_state()
    simple_model = torch.nn.Linear(1, 1)
    # First define an accelerator
    accelerator = Accelerator()
    # Then call `reset_state`, breaking the state existing in the accelerator
    AcceleratorState._reset_state()
    # Now try and prepare a simple model, should raise the custom error early
    with pytest.raises(AttributeError) as cm:
        accelerator.prepare(simple_model)

    error_text = str(cm.value.args[0])
    expected_fragments = (
        "`AcceleratorState` object has no attribute",
        "This happens if `AcceleratorState._reset_state()`",
    )
    for fragment in expected_fragments:
        assert fragment in error_text


def main():
    """
    Entry point of the script: runs every check in this file in a fixed order.

    Section banners are printed by a single process only (process index 0, or local process index 0, depending on the
    section). The process-execution and `split_between_processes` checks only run when the number of processes per
    node equals the total number of processes. Under DeepSpeed the function returns before the training, trigger and
    reinstantiated-state checks.
    """
    accelerator = Accelerator()
    state = accelerator.state
    if state.local_process_index == 0:
        print("**Initialization**")
    init_state_check()
    state.wait_for_everyone()

    # For MULTI_GPU the per-node process count is taken from the visible CUDA devices; otherwise the run is treated
    # as single-node.
    if state.distributed_type == DistributedType.MULTI_GPU:
        num_processes_per_node = torch.cuda.device_count()
    else:
        num_processes_per_node = state.num_processes

    # We only run this test on non-multinode
    if num_processes_per_node == state.num_processes:
        if state.process_index == 0:
            print("\n**Test process execution**")
        process_execution_check()

        if state.process_index == 0:
            print("\n**Test split between processes as a list**")
        test_split_between_processes_list()

        if state.process_index == 0:
            print("\n**Test split between processes as a dict**")
        test_split_between_processes_nested_dict()

        if state.process_index == 0:
            print("\n**Test split between processes as a tensor**")
        test_split_between_processes_tensor()

        if state.process_index == 0:
            print("\n**Test split between processes evenly**")
        test_split_between_processes_evenly()

        if state.process_index == 0:
            print("\n**Test split between processes as a datasets.Dataset**")
        if is_datasets_available():
            # Imported lazily so the script still runs without `datasets` installed
            from datasets import Dataset as datasets_Dataset

            test_split_between_processes_dataset(datasets_Dataset)
        else:
            print("Skipped because Hugging Face datasets is not available")

    if state.local_process_index == 0:
        print("\n**Test random number generator synchronization**")
    rng_sync_check()

    if state.local_process_index == 0:
        print("\n**DataLoader integration test**")
    dl_preparation_check()
    # The remaining dataloader checks are not run on XLA
    if state.distributed_type != DistributedType.XLA:
        central_dl_preparation_check()
        custom_sampler_check()
        check_seedable_sampler()
        check_seedable_sampler_with_data_seed()

    if state.num_processes > 1:
        check_seedable_sampler_in_batch_sampler_shard()

    # Trainings are not exactly the same in DeepSpeed and CPU mode
    # NOTE(review): this early return also skips `state.destroy_process_group()` below - confirm that is intended.
    if state.distributed_type == DistributedType.DEEPSPEED:
        return

    if state.local_process_index == 0:
        print("\n**Training integration test**")
    training_check(use_seedable_sampler=False)
    training_check(use_seedable_sampler=True)

    if state.local_process_index == 0:
        print("\n**Breakpoint trigger test**")
    test_trigger()

    # `test_reinstantiated_state` imports pytest, so it only runs when pytest is installed
    if is_pytest_available():
        if state.local_process_index == 0:
            print("\n**Test reinstantiated state**")
        test_reinstantiated_state()

    state.destroy_process_group()


# Run the checks only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/test_utils/scripts/test_sync.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from copy import deepcopy

import torch
import torch.nn.functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader

from accelerate.accelerator import Accelerator, DataLoaderConfiguration, GradientAccumulationPlugin
from accelerate.state import GradientState
from accelerate.test_utils import RegressionDataset, RegressionModel
from accelerate.utils import DistributedType, set_seed


def check_model_parameters(model_a, model_b, did_step, iteration, **kwargs):
    """
    Assert that the gradients of two models agree (or disagree) parameter by parameter.

    Args:
        model_a: Model whose parameters drive the iteration; parameters with `requires_grad=False` are ignored.
        model_b: Model whose parameters are paired positionally with those of `model_a`.
        did_step: When truthy, every gradient pair must be close; otherwise every pair must NOT be close.
        iteration: Only used in the assertion messages.
        **kwargs: Forwarded to `torch.allclose` (e.g. `rtol`).

    Raises:
        AssertionError: If any gradient pair does not match the expectation.
    """
    for param, grad_param in zip(model_a.parameters(), model_b.parameters()):
        if param.requires_grad:
            grads_close = torch.allclose(param.grad, grad_param.grad, **kwargs)
            if did_step:
                # Grads should be in sync
                assert grads_close is True, (
                    f"Gradients not in sync when they should be at iteration {iteration}:\nmodel_a grad ({param.grad}) != model_b grad ({grad_param.grad})"
                )
            else:
                # Grads should not be in sync
                assert grads_close is False, (
                    f"Gradients in sync when they should not be at iteration {iteration}:\nmodel_a grad ({param.grad}) == model_b grad ({grad_param.grad})"
                )


def step_model(model, input, target, accelerator, do_backward=True):
    """
    Run one forward pass with an MSE loss and backpropagate it.

    Args:
        model: Module to put in train mode and call on `input`.
        input: Batch passed to the model.
        target: Regression target; moved to the device of the model output before the loss is computed.
        accelerator: Provides `backward` and `gradient_accumulation_steps`.
        do_backward: When truthy, the loss is handed to `accelerator.backward`. When falsy, the loss is divided by
            `accelerator.gradient_accumulation_steps` and `loss.backward()` is called directly.
    """
    model.train()
    prediction = model(input)
    loss = F.mse_loss(prediction, target.to(prediction.device))
    if do_backward:
        accelerator.backward(loss)
        return
    # Manual path: scale by the accumulation steps ourselves, then plain autograd backward
    loss /= accelerator.gradient_accumulation_steps
    loss.backward()


def get_training_setup(accelerator, sched=False):
    """
    Build the objects needed for a basic training comparison.

    A `RegressionModel` is created under a fixed seed and deep-copied; the original is moved to `accelerator.device`
    and left unprepared, while the copy and the dataloader go through `accelerator.prepare`.

    Args:
        accelerator: Accelerator used to place the reference model and to prepare the copy.
        sched: When truthy, AdamW optimizers and `LambdaLR` schedulers are created for both models as well.

    Returns:
        `(model, ddp_model, dataloader)` when `sched` is falsy, otherwise
        `(model, opt, sched, dataloader, ddp_model, ddp_opt, ddp_sched)`.
    """
    set_seed(42)
    model = RegressionModel()
    # Copy taken before any device move or preparation
    ddp_model = deepcopy(model)
    dset = RegressionDataset(length=80)
    dataloader = DataLoader(dset, batch_size=16)
    model.to(accelerator.device)

    if not sched:
        ddp_model, dataloader = accelerator.prepare(ddp_model, dataloader)
        return model, ddp_model, dataloader

    opt = AdamW(params=model.parameters(), lr=1e-3)
    ddp_opt = AdamW(params=ddp_model.parameters(), lr=1e-3)
    base_sched = LambdaLR(opt, lr_lambda=lambda epoch: epoch**0.65)
    ddp_sched = LambdaLR(ddp_opt, lr_lambda=lambda epoch: epoch**0.65)
    # Only the copy's optimizer and scheduler are prepared
    ddp_model, ddp_opt, ddp_sched, dataloader = accelerator.prepare(ddp_model, ddp_opt, ddp_sched, dataloader)
    return (model, opt, base_sched, dataloader, ddp_model, ddp_opt, ddp_sched)


def test_noop_sync(accelerator):
    """
    Check that `accelerator.no_sync` does nothing on a single CPU or GPU.

    A single batch is reused for three iterations. On each one the reference model is stepped on the gathered batch,
    and the prepared model is stepped on the local batch, inside `no_sync` on even iterations and outside it on odd
    ones. The gradients of both models must match after every iteration.

    Args:
        accelerator: Accelerator used to build the training setup and to run the backward passes.
    """
    model, ddp_model, dataloader = get_training_setup(accelerator)
    # Use a single batch
    ddp_input, ddp_target = next(iter(dataloader)).values()
    for iteration in range(3):
        # Gather the distributed inputs and targets for the base model
        gathered_input, gathered_target = accelerator.gather((ddp_input, ddp_target))
        gathered_input = gathered_input.to(accelerator.device)
        gathered_target = gathered_target.to(accelerator.device)
        # Perform our initial ground truth step in non "DDP"
        step_model(model, gathered_input, gathered_target, accelerator)
        # Do "gradient accumulation" (noop)
        if iteration % 2 == 0:
            # Accumulate grads locally
            with accelerator.no_sync(ddp_model):
                step_model(ddp_model, ddp_input, ddp_target, accelerator)
        else:
            # Sync grads
            step_model(ddp_model, ddp_input, ddp_target, accelerator)

        # Since `no_sync` is a noop, `ddp_model` and `model` grads should always be in sync.
        # The shared helper performs the per-parameter `allclose` assertion; repeating it by hand is redundant.
        check_model_parameters(model, ddp_model, True, iteration)

        # Shuffle ddp_input on each iteration
        torch.manual_seed(1337 + iteration)
        ddp_input = ddp_input[torch.randperm(len(ddp_input))]


def test_distributed_sync(accelerator):
    """
    Check that `accelerator.no_sync` really withholds gradient synchronization in a distributed run.

    A single batch is reused for three iterations. Even iterations step the prepared model inside `no_sync` and
    expect its gradients to differ from the reference model; odd iterations step it normally and expect them to match.

    Args:
        accelerator: Accelerator used to build the training setup and to run the backward passes.
    """
    model, ddp_model, dataloader = get_training_setup(accelerator)
    # One batch is drawn up front and reused (reshuffled) on every iteration
    ddp_input, ddp_target = next(iter(dataloader)).values()
    for iteration in range(3):
        skip_sync = iteration % 2 == 0
        # The reference model is stepped on the inputs and targets gathered from all processes
        gathered_input, gathered_target = accelerator.gather((ddp_input, ddp_target))
        gathered_input = gathered_input.to(accelerator.device)
        gathered_target = gathered_target.to(accelerator.device)
        step_model(model, gathered_input, gathered_target, accelerator)

        if skip_sync:
            # Accumulate grads locally
            with accelerator.no_sync(ddp_model):
                step_model(ddp_model, ddp_input, ddp_target, accelerator)
        else:
            # Sync grads
            step_model(ddp_model, ddp_input, ddp_target, accelerator)

        # Gradients may only agree on the iterations that were allowed to sync
        for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
            if param.requires_grad:
                grads_close = torch.allclose(param.grad, ddp_param.grad)
                if skip_sync:
                    assert grads_close is False, (
                        f"Gradients in sync when they should not be:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
                    )
                else:
                    assert grads_close is True, (
                        f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
                    )

        # Shuffle ddp_input on each iteration
        torch.manual_seed(1337 + iteration)
        ddp_input = ddp_input[torch.randperm(len(ddp_input))]


def test_distributed_sync_multiple_fwd(accelerator):
    """
    Check gradient synchronization when several forward passes are followed by several backward passes.

    Three forward passes of the prepared model are run inside `no_sync` and their losses are stored. The losses are
    then backpropagated one by one: gradients must differ from the reference model after every backward except the
    last, which runs inside `trigger_sync_in_backward` and must bring them in sync.

    Args:
        accelerator: Accelerator used to build the training setup and to run the backward passes.
    """
    # Test on distributed setup that context manager behaves properly when used with multiple forwards followed by multiple backwards
    model, ddp_model, dataloader = get_training_setup(accelerator)
    # Do multiple forwards
    losses = []
    num_iterations = 3
    for iteration in range(num_iterations):
        # A fresh iterator is created on every pass, so this takes the first item of a new iteration each time
        ddp_input, ddp_target = next(iter(dataloader)).values()

        # Gather the distributed inputs and targs for the base model
        input, target = accelerator.gather((ddp_input, ddp_target))
        input, target = input.to(accelerator.device), target.to(accelerator.device)

        # Perform our initial ground truth step in non "DDP"
        step_model(model, input, target, accelerator)

        # Accumulate grads locally
        with accelerator.no_sync(ddp_model):
            ddp_output = ddp_model(ddp_input)
            loss = F.mse_loss(ddp_output, ddp_target.to(ddp_output.device))
            # Keep the loss (and its graph) alive so it can be backpropagated in the second loop
            losses.append(loss)

    # Do multiple backwards and sync only at the last backward
    for iteration in range(num_iterations):
        loss = losses[iteration]

        if iteration < num_iterations - 1:
            # Accumulate grads locally
            accelerator.backward(loss)

            # DDP model and model should only be in sync after last backward
            for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
                if not param.requires_grad:
                    continue
                # Grads should not be in sync
                assert torch.allclose(param.grad, ddp_param.grad) is False, (
                    f"Gradients in sync when they should not be:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
                )

        else:
            # Sync grads if last backward
            with accelerator.trigger_sync_in_backward(ddp_model):
                accelerator.backward(loss)

            # DDP model and model should only be in sync after last backward
            for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
                if not param.requires_grad:
                    continue
                # Grads should be in sync
                assert torch.allclose(param.grad, ddp_param.grad) is True, (
                    f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
                )


def test_gradient_accumulation(split_batches=False, dispatch_batches=False, sync_each_batch=False):
    """
    Check `accelerator.accumulate` with two accumulation steps against a manually stepped reference model.

    Args:
        split_batches: Forwarded to `DataLoaderConfiguration`.
        dispatch_batches: Forwarded to `DataLoaderConfiguration`.
        sync_each_batch: Forwarded to `GradientAccumulationPlugin`; when true, gradients are expected to be in sync
            after every batch.
    """
    gradient_accumulation_plugin = GradientAccumulationPlugin(num_steps=2, sync_each_batch=sync_each_batch)
    dataloader_config = DataLoaderConfiguration(split_batches=split_batches, dispatch_batches=dispatch_batches)
    accelerator = Accelerator(
        dataloader_config=dataloader_config,
        gradient_accumulation_plugin=gradient_accumulation_plugin,
    )
    # Test that context manager behaves properly
    model, ddp_model, dataloader = get_training_setup(accelerator)
    for iteration, batch in enumerate(dataloader):
        ddp_input, ddp_target = batch.values()
        # Gather the distributed inputs and targs for the base model
        input, target = accelerator.gather((ddp_input, ddp_target))
        input, target = input.to(accelerator.device), target.to(accelerator.device)
        # Perform our initial ground truth step in non "DDP"
        # (`do_backward=False`: the reference loss is scaled by the accumulation steps and backpropagated directly)
        step_model(model, input, target, accelerator, False)
        # Step the prepared model under the `accumulate` context manager
        with accelerator.accumulate(ddp_model):
            step_model(ddp_model, ddp_input, ddp_target, accelerator)

        # DDP model and model should be in sync on every second batch, on the last batch, or always when
        # `sync_each_batch` is set
        for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
            if not param.requires_grad:
                continue
            if ((iteration + 1) % 2 == 0) or (iteration == len(dataloader) - 1) or sync_each_batch:
                # Grads should be in sync
                assert torch.allclose(param.grad, ddp_param.grad) is True, (
                    f"Gradients not in sync when they should be at iteration {iteration}:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
                )
            else:
                # Grads should not be in sync
                assert torch.allclose(param.grad, ddp_param.grad) is False, (
                    f"Gradients in sync when they should not be at iteration {iteration}:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
                )

        # Re-seed and shuffle ddp_input on each iteration
        # NOTE(review): `ddp_input` is reassigned from `batch` at the top of the next iteration, so the shuffled
        # tensor is never used - confirm whether this is still needed.
        torch.manual_seed(1337 + iteration)
        ddp_input = ddp_input[torch.randperm(len(ddp_input))]
    # Clear the shared gradient state so later tests start clean
    GradientState._reset_state()


def test_gradient_accumulation_with_opt_and_scheduler(
    split_batches=False, dispatch_batches=False, sync_each_batch=False
):
    """
    Check `accelerator.accumulate` together with a prepared optimizer and LR scheduler.

    The reference model, optimizer and scheduler are stepped by hand; the prepared ones are stepped inside
    `accumulate`. After every batch the learning rates must be equal, and on multi-process runs the gradients are
    compared through `check_model_parameters`.

    Args:
        split_batches: Forwarded to `DataLoaderConfiguration`; also decides how often the reference scheduler steps.
        dispatch_batches: Forwarded to `DataLoaderConfiguration`.
        sync_each_batch: Forwarded to `GradientAccumulationPlugin`; when true, gradients are expected to be in sync
            after every batch.
    """
    gradient_accumulation_plugin = GradientAccumulationPlugin(num_steps=2, sync_each_batch=sync_each_batch)
    dataloader_config = DataLoaderConfiguration(split_batches=split_batches, dispatch_batches=dispatch_batches)
    accelerator = Accelerator(
        dataloader_config=dataloader_config,
        gradient_accumulation_plugin=gradient_accumulation_plugin,
    )
    # Test that context manager behaves properly
    model, opt, sched, dataloader, ddp_model, ddp_opt, ddp_sched = get_training_setup(accelerator, True)
    for iteration, batch in enumerate(dataloader):
        ddp_input, ddp_target = batch.values()
        # Gather the distributed inputs and targs for the base model
        input, target = accelerator.gather((ddp_input, ddp_target))
        input, target = input.to(accelerator.device), target.to(accelerator.device)
        # Perform our initial ground truth step in non "DDP"
        model.train()
        ddp_model.train()
        step_model(model, input, target, accelerator, False)
        opt.step()

        # The reference scheduler only advances on accumulation boundaries (every second batch or the last batch):
        # once with `split_batches`, otherwise once per process
        if ((iteration + 1) % 2 == 0) or ((iteration + 1) == len(dataloader)):
            if split_batches:
                sched.step()
            else:
                for _ in range(accelerator.num_processes):
                    sched.step()

        # Perform gradient accumulation under wrapper
        with accelerator.accumulate(ddp_model):
            step_model(ddp_model, ddp_input, ddp_target, accelerator)
            ddp_opt.step()
            ddp_sched.step()

        # Learning rates should be the same
        assert opt.param_groups[0]["lr"] == ddp_opt.param_groups[0]["lr"], (
            f"Learning rates found in each optimizer did not align\nopt: {opt.param_groups[0]['lr']}\nDDP opt: {ddp_opt.param_groups[0]['lr']}\n"
        )
        did_step = (((iteration + 1) % 2) == 0) or ((iteration + 1) == len(dataloader))
        if accelerator.num_processes > 1:
            check_model_parameters(
                model,
                ddp_model,
                did_step or sync_each_batch,  # syncs at each grad_accum interval or if sync_each_batch==True
                iteration,
                rtol=1e-3,  # needs a relative tolerance due to roundoff errors
            )

        if did_step:
            opt.zero_grad()  # flush gradients every accum step
        ddp_opt.zero_grad()

        # Re-seed torch's RNG on each iteration (no input is actually shuffled here)
        torch.manual_seed(1337 + iteration)
    # Clear the shared gradient state so later tests start clean
    GradientState._reset_state()


def test_dataloader_break():
    """
    Check the gradient state's dataloader tracking when a second dataloader is iterated inside the first.

    While iterating the first prepared dataloader, the second one is fully iterated during iteration 1. At every
    point `active_dataloader` must be the loader currently being iterated and `end_of_dataloader` must only be set on
    that loader's last batch. No dataloader may be active before or after the outer loop.
    """
    accelerator = Accelerator()
    first_dset = RegressionDataset(length=80)
    first_dataloader = DataLoader(first_dset, batch_size=16)
    second_dset = RegressionDataset(length=96)
    second_dataloader = DataLoader(second_dset, batch_size=16)
    first_dataloader, second_dataloader = accelerator.prepare(first_dataloader, second_dataloader)

    assert accelerator.gradient_state.active_dataloader is None
    for iteration, _ in enumerate(first_dataloader):
        assert accelerator.gradient_state.active_dataloader is first_dataloader
        if iteration >= len(first_dataloader) - 1:
            # Last batch of the outer loader
            assert accelerator.gradient_state.end_of_dataloader
            continue
        assert not accelerator.gradient_state.end_of_dataloader
        if iteration != 1:
            continue
        # Nested iteration: the inner loader must take over as the active one
        for batch_num, _ in enumerate(second_dataloader):
            assert accelerator.gradient_state.active_dataloader is second_dataloader
            if batch_num >= len(second_dataloader) - 1:
                assert accelerator.gradient_state.end_of_dataloader
            else:
                assert not accelerator.gradient_state.end_of_dataloader
    assert accelerator.gradient_state.active_dataloader is None


def main():
    """
    Entry point of the script: runs the sync and gradient-accumulation tests that apply to the current distributed
    type, printing a banner from the local main process before each one, and destroys the process group at the end.
    """
    accelerator = Accelerator()
    state = accelerator.state

    def announce(*parts):
        # Banners are printed by the local main process only
        if state.local_process_index == 0:
            print(*parts)

    # Backends on which the `accumulate` tests are run for every flag combination
    accumulate_backends = (
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_NPU,
        DistributedType.MULTI_MLU,
        DistributedType.MULTI_SDAA,
        DistributedType.MULTI_MUSA,
        DistributedType.MULTI_HPU,
        DistributedType.MULTI_NEURON,
    )
    # The `no_sync` tests additionally run on multi-CPU
    no_sync_backends = accumulate_backends + (DistributedType.MULTI_CPU,)
    # Every (split_batches, dispatch_batches, sync_each_batch) combination, `True` first
    flag_grid = [
        (split_batch, dispatch_batches, sync_each_batch)
        for split_batch in (True, False)
        for dispatch_batches in (True, False)
        for sync_each_batch in (True, False)
    ]

    announce("**Test `accumulate` gradient accumulation with dataloader break**")
    if state.distributed_type != DistributedType.XLA:
        test_dataloader_break()

    if state.distributed_type == DistributedType.NO:
        announce("**Test NOOP `no_sync` context manager**")
        test_noop_sync(accelerator)

    if state.distributed_type in no_sync_backends:
        announce("**Test Distributed `no_sync` context manager**")
        test_distributed_sync(accelerator)
        announce("**Test Distributed `no_sync` context manager with multiple forwards**")
        test_distributed_sync_multiple_fwd(accelerator)

    if state.distributed_type in accumulate_backends:
        for split_batch, dispatch_batches, sync_each_batch in flag_grid:
            announce(
                "**Test `accumulate` gradient accumulation, ",
                f"`split_batches={split_batch}` and `dispatch_batches={dispatch_batches}` and `sync_each_batch={sync_each_batch}`**",
            )
            test_gradient_accumulation(split_batch, dispatch_batches, sync_each_batch)

    # Currently will break on torch 2.0 +, need to investigate why
    announce(
        "**Test `accumulate` gradient accumulation with optimizer and scheduler, ",
        "`split_batches=False`, `dispatch_batches=False`, `sync_each_batch=False`**",
    )
    test_gradient_accumulation_with_opt_and_scheduler()

    if state.distributed_type in accumulate_backends:
        for split_batch, dispatch_batches, sync_each_batch in flag_grid:
            # The all-False combination already ran unconditionally above
            if not (split_batch or dispatch_batches or sync_each_batch):
                continue
            announce(
                "**Test `accumulate` gradient accumulation with optimizer and scheduler, ",
                f"`split_batches={split_batch}` and `dispatch_batches={dispatch_batches}` and `sync_each_batch={sync_each_batch}`**",
            )
            test_gradient_accumulation_with_opt_and_scheduler(split_batch, dispatch_batches, sync_each_batch)
    state.destroy_process_group()


def _mp_fn(index):
    """
    Per-process entry point for `xla_spawn` (TPUs).

    Args:
        index: Accepted to match the spawn callback signature; not used by this function.
    """
    # For xla_spawn (TPUs)
    main()


# Run the tests only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: src/accelerate/test_utils/testing.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import inspect
import io
import os
import re
import shutil
import subprocess
import sys
import tempfile
import unittest
from contextlib import contextmanager
from functools import partial
from pathlib import Path
from typing import Optional, Union
from unittest import mock

import torch

import accelerate

from ..state import AcceleratorState
from ..utils import (
    check_cuda_fp8_capability,
    compare_versions,
    gather,
    is_aim_available,
    is_bnb_available,
    is_clearml_available,
    is_comet_ml_available,
    is_cuda_available,
    is_datasets_available,
    is_deepspeed_available,
    is_dvclive_available,
    is_fp8_available,
    is_fp16_available,
    is_habana_gaudi1,
    is_hpu_available,
    is_import_timer_available,
    is_matplotlib_available,
    is_mlflow_available,
    is_mlu_available,
    is_mps_available,
    is_musa_available,
    is_neuron_available,
    is_npu_available,
    is_pandas_available,
    is_pippy_available,
    is_pytest_available,
    is_schedulefree_available,
    is_sdaa_available,
    is_swanlab_available,
    is_tensorboard_available,
    is_timm_available,
    is_torch_version,
    is_torch_xla_available,
    is_torchao_available,
    is_torchdata_stateful_dataloader_available,
    is_torchvision_available,
    is_trackio_available,
    is_transformer_engine_available,
    is_transformer_engine_mxfp8_available,
    is_transformers_available,
    is_triton_available,
    is_wandb_available,
    is_xpu_available,
    str_to_bool,
)


def get_backend():
    """
    Select the backend the test utilities should target.

    Availability checks are tried in a fixed order and the first one that succeeds decides the result; when none
    succeeds, the CPU fallback is returned.

    Returns:
        `tuple`: `(device_name, device_count, memory_allocated_fn)`, where `memory_allocated_fn` is a callable. For
        the CPU fallback and for MPS without the `min_version="2.0"` check passing, it is a lambda returning 0.
    """
    if is_torch_xla_available():
        # The XLA branch reports the CUDA device count and memory function
        return "xla", torch.cuda.device_count(), torch.cuda.memory_allocated
    if is_cuda_available():
        return "cuda", torch.cuda.device_count(), torch.cuda.memory_allocated
    if is_mps_available(min_version="2.0"):
        return "mps", 1, torch.mps.current_allocated_memory
    if is_mps_available():
        return "mps", 1, lambda: 0
    if is_mlu_available():
        return "mlu", torch.mlu.device_count(), torch.mlu.memory_allocated
    if is_sdaa_available():
        return "sdaa", torch.sdaa.device_count(), torch.sdaa.memory_allocated
    if is_musa_available():
        return "musa", torch.musa.device_count(), torch.musa.memory_allocated
    if is_npu_available():
        return "npu", torch.npu.device_count(), torch.npu.memory_allocated
    if is_xpu_available():
        return "xpu", torch.xpu.device_count(), torch.xpu.memory_allocated
    if is_hpu_available():
        return "hpu", torch.hpu.device_count(), torch.hpu.memory_allocated
    if is_neuron_available():
        return "neuron", torch.neuron.device_count(), torch.neuron.memory_allocated
    return "cpu", 1, lambda: 0


# Resolved once at import time: backend name, number of devices, and a callable reporting allocated memory.
torch_device, device_count, memory_allocated_func = get_backend()


def get_launch_command(**kwargs) -> list:
    """
    Build an `accelerate launch` argument list from keyword arguments, for use with `subprocess`.

    Each keyword becomes one command-line item, in the order given:

    - `True` is rendered as a bare flag, `--name`
    - `None` is dropped
    - anything else (including `False`) is rendered as `--name=value`

    Example:
    ```python
    # returns ['accelerate', 'launch', '--num_processes=2', '--device_count=2']
    get_launch_command(num_processes=2, device_count=2)
    ```
    """
    rendered_options = [
        f"--{name}" if isinstance(value, bool) and value else f"--{name}={value}"
        for name, value in kwargs.items()
        if value is not None
    ]
    return ["accelerate", "launch", *rendered_options]


# Built once at import time: one process per detected device, with `--monitor_interval=0.1`.
DEFAULT_LAUNCH_COMMAND = get_launch_command(num_processes=device_count, monitor_interval=0.1)


def parse_flag_from_env(key, default=False):
    """
    Read a boolean flag from the environment.

    Args:
        key: Name of the environment variable.
        default: Value returned as-is when the variable is not set.

    Returns:
        `default` when `key` is unset, otherwise the result of `str_to_bool` on the variable's value.

    Raises:
        ValueError: If the variable is set to something `str_to_bool` rejects.
    """
    if key not in os.environ:
        # KEY isn't set, default to `default`.
        return default
    raw_value = os.environ[key]
    try:
        # KEY is set, convert it to True or False.
        return str_to_bool(raw_value)
    except ValueError:
        # More values are supported, but let's keep the message simple.
        raise ValueError(f"If set, {key} must be yes or no.")


# Read once at import time from the `RUN_SLOW` environment variable; defaults to False when unset.
_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)


def skip(test_case):
    """
    Decorator that skips a test unconditionally, with the reason "Test was skipped".
    """
    unconditional_skip = unittest.skip("Test was skipped")
    return unconditional_skip(test_case)


def slow(test_case):
    """
    Decorator marking a test as slow. Slow tests are skipped unless the module-level `_run_slow_tests` flag (read from
    the RUN_SLOW environment variable) is truthy.
    """
    skip_unless_slow_enabled = unittest.skipUnless(_run_slow_tests, "test is slow")
    return skip_unless_slow_enabled(test_case)


def require_cpu(test_case):
    """
    Decorator marking a test that must only run on the CPU. The test is skipped unless the detected `torch_device`
    is "cpu".
    """
    running_on_cpu = torch_device == "cpu"
    return unittest.skipUnless(running_on_cpu, "test requires only a CPU")(test_case)


def require_non_cpu(test_case):
    """
    Decorator marking a test that requires a hardware accelerator backend. The test is skipped when the detected
    `torch_device` is "cpu".
    """
    has_accelerator = torch_device != "cpu"
    return unittest.skipUnless(has_accelerator, "test requires a GPU")(test_case)


def require_cuda(test_case):
    """
    Decorator marking a test that requires CUDA. The test is skipped when CUDA is not available or when TorchXLA is
    available.
    """
    cuda_without_xla = is_cuda_available() and not is_torch_xla_available()
    return unittest.skipUnless(cuda_without_xla, "test requires a GPU")(test_case)


def require_cuda_or_hpu(test_case):
    """
    Decorator marking a test that requires CUDA or HPU. The test runs when CUDA is available without TorchXLA, or
    when an HPU is available; otherwise it is skipped.
    """
    cuda_or_hpu = (is_cuda_available() and not is_torch_xla_available()) or is_hpu_available()
    return unittest.skipUnless(cuda_or_hpu, "test requires a GPU or HPU")(test_case)


def require_xpu(test_case):
    """
    Decorator marking a test that requires XPU. The test is skipped unless `is_xpu_available()` is true.
    """
    skip_unless_xpu = unittest.skipUnless(is_xpu_available(), "test requires a XPU")
    return skip_unless_xpu(test_case)


def require_cuda_or_xpu(test_case):
    """
    Decorator marking a test that requires CUDA or XPU. The test runs when CUDA is available without TorchXLA, or
    when an XPU is available; otherwise it is skipped.
    """
    # Both probes are always evaluated, CUDA first
    has_cuda = is_cuda_available() and not is_torch_xla_available()
    has_xpu = is_xpu_available()
    skip_unless_supported = unittest.skipUnless(has_cuda or has_xpu, "test requires a CUDA GPU or XPU")
    return skip_unless_supported(test_case)


def require_non_xpu(test_case):
    """
    Decorator marking a test that should be skipped when the detected `torch_device` is "xpu".
    """
    not_on_xpu = torch_device != "xpu"
    return unittest.skipUnless(not_on_xpu, "test requires a non-XPU")(test_case)


def require_non_hpu(test_case):
    """
    Decorator marking a test that should be skipped when the detected `torch_device` is "hpu".
    """
    not_on_hpu = torch_device != "hpu"
    return unittest.skipUnless(not_on_hpu, "test requires a non-HPU")(test_case)


def require_fp16(test_case):
    """
    Decorator marking a test that requires FP16. The test is skipped unless `is_fp16_available()` is true.
    """
    skip_unless_fp16 = unittest.skipUnless(is_fp16_available(), "test requires FP16 support")
    return skip_unless_fp16(test_case)


def require_fp8(test_case):
    """
    Decorator marking a test that requires FP8. The test is skipped when `is_fp8_available()` is false, when CUDA is
    available but `check_cuda_fp8_capability()` fails, or when running on an HPU identified as Gaudi1.
    """
    # is_fp8_available only checks for libraries
    # ideally it should check for device capability as well
    libraries_present = is_fp8_available()

    # Device-level vetoes on top of the library check
    cuda_lacks_fp8 = torch.cuda.is_available() and not check_cuda_fp8_capability()
    running_on_gaudi1 = is_hpu_available() and is_habana_gaudi1()

    fp8_is_available = False if (cuda_lacks_fp8 or running_on_gaudi1) else libraries_present
    return unittest.skipUnless(fp8_is_available, "test requires FP8 support")(test_case)


def require_fsdp2(test_case):
    """
    Decorator marking a test that requires FSDP2. The test is skipped unless the installed torch version is at least
    2.5.0.
    """
    torch_is_recent_enough = is_torch_version(">=", "2.5.0")
    return unittest.skipUnless(torch_is_recent_enough, "test requires FSDP2 (torch >= 2.5.0)")(test_case)


def require_mlu(test_case):
    """
    Decorator marking a test that requires MLU. The test is skipped unless `is_mlu_available()` is true.
    """
    skip_unless_mlu = unittest.skipUnless(is_mlu_available(), "test require a MLU")
    return skip_unless_mlu(test_case)


def require_sdaa(test_case):
    """
    Decorator marking a test that requires SDAA. The test is skipped unless `is_sdaa_available()` is true.
    """
    skip_unless_sdaa = unittest.skipUnless(is_sdaa_available(), "test require a SDAA")
    return skip_unless_sdaa(test_case)


def require_musa(test_case):
    """
    Decorator marking a test that requires MUSA. The test is skipped unless `is_musa_available()` is true.
    """
    skip_unless_musa = unittest.skipUnless(is_musa_available(), "test require a MUSA")
    return skip_unless_musa(test_case)


def require_npu(test_case):
    """
    Decorator marking a test that requires NPU. The test is skipped unless `is_npu_available()` is true.
    """
    skip_unless_npu = unittest.skipUnless(is_npu_available(), "test require a NPU")
    return skip_unless_npu(test_case)


def require_neuron(test_case):
    """
    Decorator marking a test that requires Neuron Cores. The test is skipped unless `is_neuron_available()` is true.
    """
    skip_unless_neuron = unittest.skipUnless(is_neuron_available(), "test require Neuron Cores")
    return skip_unless_neuron(test_case)


def require_mps(test_case):
    """
    Decorator marking a test that requires the MPS backend. The test is skipped unless `is_mps_available()` is true.
    """
    skip_unless_mps = unittest.skipUnless(is_mps_available(), "test requires a `mps` backend support in `torch`")
    return skip_unless_mps(test_case)


def require_huggingface_suite(test_case):
    """
    Decorator marking a test that requires both transformers and datasets. The test is skipped when either library
    is unavailable.
    """
    suite_installed = is_transformers_available() and is_datasets_available()
    return unittest.skipUnless(suite_installed, "test requires the Hugging Face suite")(test_case)


def require_datasets(test_case):
    """
    Decorator for tests that need the datasets library. The test is skipped when `is_datasets_available()` reports
    that it is not available.
    """
    skip_decorator = unittest.skipUnless(is_datasets_available(), "test requires the datasets library")
    return skip_decorator(test_case)


def require_transformers(test_case):
    """
    Decorator for tests that need the transformers library. The test is skipped when `is_transformers_available()`
    reports that it is not available.
    """
    skip_decorator = unittest.skipUnless(is_transformers_available(), "test requires the transformers library")
    return skip_decorator(test_case)


def require_timm(test_case):
    """
    Decorator for tests that need the timm library. The test is skipped when `is_timm_available()` reports that it
    is not available.
    """
    skip_decorator = unittest.skipUnless(is_timm_available(), "test requires the timm library")
    return skip_decorator(test_case)


def require_torchvision(test_case):
    """
    Decorator for tests that need the torchvision library. The test is skipped when `is_torchvision_available()`
    reports that it is not available.
    """
    skip_decorator = unittest.skipUnless(is_torchvision_available(), "test requires the torchvision library")
    return skip_decorator(test_case)


def require_triton(test_case):
    """
    Decorator for tests that need the triton library. The test is skipped when `is_triton_available()` reports that
    it is not available.
    """
    skip_decorator = unittest.skipUnless(is_triton_available(), "test requires the triton library")
    return skip_decorator(test_case)


def require_schedulefree(test_case):
    """
    Decorator for tests that need the schedulefree library. The test is skipped when `is_schedulefree_available()`
    reports that it is not available.
    """
    skip_decorator = unittest.skipUnless(is_schedulefree_available(), "test requires the schedulefree library")
    return skip_decorator(test_case)


def require_bnb(test_case):
    """
    Decorator for tests that need the bitsandbytes library. The test is skipped when `is_bnb_available()` reports
    that it is not available.
    """
    skip_decorator = unittest.skipUnless(is_bnb_available(), "test requires the bitsandbytes library")
    return skip_decorator(test_case)


def require_tpu(test_case):
    """
    Decorator for tests that need TPUs. The test is skipped unless `is_torch_xla_available(check_is_tpu=True)`
    holds.
    """
    tpu_is_available = is_torch_xla_available(check_is_tpu=True)
    skip_decorator = unittest.skipUnless(tpu_is_available, "test requires TPU")
    return skip_decorator(test_case)


def require_non_torch_xla(test_case):
    """
    Decorator for tests that must run in an environment without TorchXLA. The test is skipped when
    `is_torch_xla_available()` reports that TorchXLA is available.
    """
    xla_is_absent = not is_torch_xla_available()
    skip_decorator = unittest.skipUnless(xla_is_absent, "test requires an env without TorchXLA")
    return skip_decorator(test_case)


def require_single_device(test_case):
    """
    Decorator for tests that need exactly one hardware accelerator. The test is skipped when `torch_device` is
    `"cpu"` or when `device_count` is anything other than one.
    """
    exactly_one_accelerator = torch_device != "cpu" and device_count == 1
    skip_decorator = unittest.skipUnless(exactly_one_accelerator, "test requires a single device accelerator")
    return skip_decorator(test_case)


def require_single_gpu(test_case):
    """
    Decorator for tests that need CUDA with exactly one GPU. The test is skipped when `torch.cuda.device_count()`
    is zero or greater than one.
    """
    exactly_one_gpu = torch.cuda.device_count() == 1
    skip_decorator = unittest.skipUnless(exactly_one_gpu, "test requires a GPU")
    return skip_decorator(test_case)


def require_single_xpu(test_case):
    """
    Decorator marking a test that requires CUDA on a single XPU. These tests are skipped when there are no XPU
    available or number of xPUs is more than one.
    """
    return unittest.skipUnless(torch.xpu.device_count() == 1, "test requires a XPU")(test_case)


def require_multi_device(test_case):
    """
    Decorator for tests that need several hardware accelerators. The test is skipped when `device_count` is not
    greater than one.
    """
    has_several_devices = device_count > 1
    skip_decorator = unittest.skipUnless(has_several_devices, "test requires multiple hardware accelerators")
    return skip_decorator(test_case)


def require_multi_gpu(test_case):
    """
    Decorator for tests that need several CUDA GPUs. The test is skipped when `torch.cuda.device_count()` is not
    greater than one.
    """
    has_several_gpus = torch.cuda.device_count() > 1
    skip_decorator = unittest.skipUnless(has_several_gpus, "test requires multiple GPUs")
    return skip_decorator(test_case)


def require_multi_xpu(test_case):
    """
    Decorator marking a test that requires a multi-XPU setup. These tests are skipped on a machine without multiple
    XPUs.
    """
    return unittest.skipUnless(torch.xpu.device_count() > 1, "test requires multiple XPUs")(test_case)


def require_multi_gpu_or_xpu(test_case):
    """
    Decorator for tests that need several GPUs or several XPUs. The test is skipped unless CUDA or XPU is reported
    available and `device_count` is greater than one.
    """
    has_several_gpus_or_xpus = (is_cuda_available() or is_xpu_available()) and device_count > 1
    skip_decorator = unittest.skipUnless(has_several_gpus_or_xpus, "test requires multiple GPUs or XPUs")
    return skip_decorator(test_case)


def require_deepspeed(test_case):
    """
    Decorator for tests that need DeepSpeed. The test is skipped when `is_deepspeed_available()` reports that it is
    not installed.
    """
    skip_decorator = unittest.skipUnless(is_deepspeed_available(), "test requires DeepSpeed")
    return skip_decorator(test_case)


def require_tp(test_case):
    """
    Decorator marking a test that requires tensor parallelism (TP) support. These tests are skipped unless torch is
    at least 2.3.0 and transformers is installed with a version of at least 4.52.0.
    """
    # Check that transformers is installed before asking for its version, so that a missing package results in a
    # skipped test instead of an error raised while the test module is being imported.
    tp_is_supported = (
        is_torch_version(">=", "2.3.0")
        and is_transformers_available()
        and compare_versions("transformers", ">=", "4.52.0")
    )
    return unittest.skipUnless(
        tp_is_supported,
        "test requires torch version >= 2.3.0 and transformers version >= 4.52.0",
    )(test_case)


def require_torch_min_version(test_case=None, version=None):
    """
    Decorator for tests that need at least a given torch version. The test is skipped when
    `is_torch_version(">=", version)` does not hold.

    Can be applied with arguments (`@require_torch_min_version(version=...)`): when `test_case` is `None` a partial
    of this function bound to `version` is returned, which then receives the test.
    """
    if test_case is not None:
        version_is_satisfied = is_torch_version(">=", version)
        skip_decorator = unittest.skipUnless(version_is_satisfied, f"test requires torch version >= {version}")
        return skip_decorator(test_case)
    return partial(require_torch_min_version, version=version)


def require_tensorboard(test_case):
    """
    Decorator for tests that need tensorboard. The test is skipped when `is_tensorboard_available()` reports that
    it is not installed.
    """
    skip_decorator = unittest.skipUnless(is_tensorboard_available(), "test requires Tensorboard")
    return skip_decorator(test_case)


def require_wandb(test_case):
    """
    Decorator for tests that need wandb. The test is skipped when `is_wandb_available()` reports that it is not
    installed.
    """
    skip_decorator = unittest.skipUnless(is_wandb_available(), "test requires wandb")
    return skip_decorator(test_case)


def require_trackio(test_case):
    """
    Decorator for tests that need trackio. The test is skipped when `is_trackio_available()` reports that it is not
    installed.
    """
    skip_decorator = unittest.skipUnless(is_trackio_available(), "test requires trackio")
    return skip_decorator(test_case)


def require_comet_ml(test_case):
    """
    Decorator for tests that need comet_ml. The test is skipped when `is_comet_ml_available()` reports that it is
    not installed.
    """
    skip_decorator = unittest.skipUnless(is_comet_ml_available(), "test requires comet_ml")
    return skip_decorator(test_case)


def require_aim(test_case):
    """
    Decorator for tests that need aim. The test is skipped when `is_aim_available()` reports that it is not
    installed.
    """
    skip_decorator = unittest.skipUnless(is_aim_available(), "test requires aim")
    return skip_decorator(test_case)


def require_clearml(test_case):
    """
    Decorator for tests that need clearml. The test is skipped when `is_clearml_available()` reports that it is not
    installed.
    """
    skip_decorator = unittest.skipUnless(is_clearml_available(), "test requires clearml")
    return skip_decorator(test_case)


def require_dvclive(test_case):
    """
    Decorator for tests that need dvclive. The test is skipped when `is_dvclive_available()` reports that it is not
    installed.
    """
    skip_decorator = unittest.skipUnless(is_dvclive_available(), "test requires dvclive")
    return skip_decorator(test_case)


def require_swanlab(test_case):
    """
    Decorator for tests that need swanlab. The test is skipped when `is_swanlab_available()` reports that it is not
    installed.
    """
    skip_decorator = unittest.skipUnless(is_swanlab_available(), "test requires swanlab")
    return skip_decorator(test_case)


def require_pandas(test_case):
    """
    Decorator for tests that need pandas. The test is skipped when `is_pandas_available()` reports that it is not
    installed.
    """
    skip_decorator = unittest.skipUnless(is_pandas_available(), "test requires pandas")
    return skip_decorator(test_case)


def require_mlflow(test_case):
    """
    Decorator for tests that need mlflow. The test is skipped when `is_mlflow_available()` reports that it is not
    installed.
    """
    skip_decorator = unittest.skipUnless(is_mlflow_available(), "test requires mlflow")
    return skip_decorator(test_case)


def require_pippy(test_case):
    """
    Decorator for tests that need pippy. The test is skipped when `is_pippy_available()` reports that pippy is not
    installed, and also when `is_habana_gaudi1()` reports a Gaudi1 device, which doesn't support pippy.
    """
    # The Gaudi1 probe only runs when pippy itself is available.
    pippy_is_usable = is_pippy_available() and not is_habana_gaudi1()
    skip_decorator = unittest.skipUnless(pippy_is_usable, "test requires pippy")
    return skip_decorator(test_case)


def require_import_timer(test_case):
    """
    Decorator for tests that need the tuna interpreter. The test is skipped when `is_import_timer_available()`
    reports that it is not installed.
    """
    skip_decorator = unittest.skipUnless(is_import_timer_available(), "test requires tuna interpreter")
    return skip_decorator(test_case)


def require_transformer_engine(test_case):
    """
    Decorator for tests that need transformer engine. The test is skipped when `is_transformer_engine_available()`
    reports that it is not installed.
    """
    skip_decorator = unittest.skipUnless(is_transformer_engine_available(), "test requires transformers engine")
    return skip_decorator(test_case)


def require_transformer_engine_mxfp8(test_case):
    """
    Decorator for tests that need transformer engine MXFP8 block scaling. The test is skipped when
    `is_transformer_engine_mxfp8_available()` reports that it is not available.
    """
    mxfp8_is_available = is_transformer_engine_mxfp8_available()
    skip_decorator = unittest.skipUnless(mxfp8_is_available, "test requires transformers engine MXFP8 block scaling")
    return skip_decorator(test_case)


def require_torchao(test_case):
    """
    Decorator for tests that need torchao. The test is skipped when `is_torchao_available()` reports that it is not
    installed.
    """
    skip_decorator = unittest.skipUnless(is_torchao_available(), "test requires torchao")
    return skip_decorator(test_case)


def require_matplotlib(test_case):
    """
    Decorator for tests that need matplotlib. The test is skipped when `is_matplotlib_available()` reports that it
    is not installed.
    """
    skip_decorator = unittest.skipUnless(is_matplotlib_available(), "test requires matplotlib")
    return skip_decorator(test_case)


# Evaluated once at import time and consumed by `require_trackers`: true only when at least one of wandb,
# tensorboard, trackio or swanlab is reported available and `comet_ml` is not.
_atleast_one_tracker_available = (
    any((is_wandb_available(), is_tensorboard_available(), is_trackio_available(), is_swanlab_available()))
    and not is_comet_ml_available()
)


def require_trackers(test_case):
    """
    Decorator for tests that need at least one tracking library. The test is skipped when the module-level
    `_atleast_one_tracker_available` flag is false, i.e. when no supported tracker is available or `comet_ml` is
    installed.
    """
    skip_reason = "test requires at least one tracker to be available and for `comet_ml` to not be installed"
    skip_decorator = unittest.skipUnless(_atleast_one_tracker_available, skip_reason)
    return skip_decorator(test_case)


def require_torchdata_stateful_dataloader(test_case):
    """
    Decorator for tests that need `torchdata.stateful_dataloader`.

    The test is skipped when `is_torchdata_stateful_dataloader_available()` reports that the module is not
    available.
    """
    stateful_dataloader_is_available = is_torchdata_stateful_dataloader_available()
    skip_decorator = unittest.skipUnless(
        stateful_dataloader_is_available, "test requires torchdata.stateful_dataloader"
    )
    return skip_decorator(test_case)


def run_first(test_case):
    """
    Decorator that tags a test with `pytest.mark.order(1)`. With the pytest-order plugin installed, tests carrying
    this mark are guaranteed to run first.

    This matters in setups such as a Gaudi instance, where a Gaudi device can only be used by a single process at a
    time: launching every test that runs in a subprocess first avoids device allocation conflicts.

    When pytest is not available the test is returned unchanged.
    """
    if not is_pytest_available():
        return test_case

    # Imported lazily so that this module does not require pytest.
    import pytest

    order_first = pytest.mark.order(1)
    return order_first(test_case)


class TempDirTestCase(unittest.TestCase):
    """
    A TestCase class that creates a single temporary directory (via `tempfile.mkdtemp`) for the duration of the
    class, wipes its contents at the start of each test, and then removes it at the end of the TestCase.

    Useful for when a class or API requires a single constant folder throughout its use, such as Weights and Biases

    The temporary directory location will be stored in `self.tmpdir` as a `pathlib.Path`. Set `clear_on_setup` to
    `False` on a subclass to keep the directory contents between tests.
    """

    # When true, `setUp` empties `tmpdir` before every test.
    clear_on_setup = True

    @classmethod
    def setUpClass(cls):
        "Creates a temporary directory with `tempfile.mkdtemp` and stores its path in `cls.tmpdir`"
        super().setUpClass()
        cls.tmpdir = Path(tempfile.mkdtemp())

    @classmethod
    def tearDownClass(cls):
        "Remove `cls.tmpdir` after test suite has finished"
        if os.path.exists(cls.tmpdir):
            shutil.rmtree(cls.tmpdir)
        super().tearDownClass()

    def setUp(self):
        "Destroy all contents in `self.tmpdir`, but not `self.tmpdir`"
        super().setUp()
        if not self.clear_on_setup:
            return
        # Work on a snapshot of the direct children only: removing each of them (recursively for directories)
        # empties the folder without deleting entries underneath a directory walk that is still in progress.
        for path in list(self.tmpdir.iterdir()):
            if path.is_dir() and not path.is_symlink():
                shutil.rmtree(path)
            else:
                # Regular files and symlinks (including ones pointing at directories or at nothing) are unlinked;
                # `shutil.rmtree` refuses to operate on a symlink.
                path.unlink()


class AccelerateTestCase(unittest.TestCase):
    """
    TestCase base class that resets the `AcceleratorState` singleton after each test.

    Any test that inspects or relies on `AcceleratorState` should derive from this class; otherwise state left over
    from one test can leak into the next and cause silent failures.
    """

    def tearDown(self):
        "Run the parent teardown, then reset the `AcceleratorState` singleton"
        super().tearDown()
        AcceleratorState._reset_state(True)


class MockingTestCase(unittest.TestCase):
    """
    TestCase base class for registering mockers dynamically so that they apply to every test, for situations where a
    regular class-wide mock cannot be used.

    This helps when a mock depends on information that only exists after `TestCase.setUpClass` has run, for example
    an environment variable that must be set from such a value.

    Call `add_mocks` at the end of the `setUp` function, after `super().setUp()`:
    ```python
    def setUp(self):
        super().setUp()
        mocks = mock.patch.dict(os.environ, {"SOME_ENV_VAR": "SOME_VALUE"})
        self.add_mocks(mocks)
    ```
    """

    def add_mocks(self, mocks: Union[mock.Mock, list[mock.Mock]]):
        """
        Start the given mocks and register their `stop` as test cleanups. Should be called during
        `MockingTestCase.setUp`, after `super().setUp()`.

        The mocks from the most recent call are kept in `self.mocks`.

        Args:
            mocks (`mock.Mock` or list of `mock.Mock`):
                Mocks that should be added to the `TestCase` after `TestCase.setUpClass` has been run
        """
        # A single mock is wrapped in a list; lists and tuples are stored as given.
        if not isinstance(mocks, (tuple, list)):
            mocks = [mocks]
        self.mocks = mocks
        for patcher in self.mocks:
            patcher.start()
            self.addCleanup(patcher.stop)


def are_the_same_tensors(tensor):
    """
    Check whether `tensor` is identical on every process.

    A copy of `tensor` is moved to the `AcceleratorState` device and gathered; the function returns `True` only when
    each gathered entry equals the local copy.
    """
    device = AcceleratorState().device
    # Add a leading dimension so that the gathered result can be indexed entry by entry.
    local_batch = tensor[None].clone().to(device)
    gathered = gather(local_batch).cpu()
    reference = local_batch[0].cpu()
    return all(torch.equal(gathered[index], reference) for index in range(gathered.shape[0]))


class _RunOutput:
    "Plain container for the return code and the captured stdout/stderr of a finished subprocess"

    def __init__(self, returncode, stdout, stderr):
        self.returncode, self.stdout, self.stderr = returncode, stdout, stderr


async def _read_stream(stream, callback):
    "Feed every line read from `stream` to `callback`, stopping at the first empty read"
    while line := await stream.readline():
        callback(line)


async def _stream_subprocess(cmd, env=None, stdin=None, timeout=None, quiet=False, echo=False) -> _RunOutput:
    """
    Launch `cmd` as a subprocess, collecting its stdout and stderr line by line while optionally echoing them.

    Args:
        cmd: Sequence whose first item is the program and whose remaining items are its arguments.
        env: Environment handed to the subprocess.
        stdin: Value forwarded as the subprocess `stdin`.
        timeout: Timeout passed to `asyncio.wait` while the output streams are being read.
        quiet: When true, captured lines are not printed.
        echo: When true, the command line is printed before launching.

    Returns:
        `_RunOutput` holding the return code and the lists of decoded stdout and stderr lines.
    """
    if echo:
        print("\nRunning: ", " ".join(cmd))

    program, *arguments = cmd
    process = await asyncio.create_subprocess_exec(
        program,
        *arguments,
        stdin=stdin,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        env=env,
    )

    # note: the docs warn about a possible deadlock when `wait` is used with huge amounts of data in the pipe
    # https://docs.python.org/3/library/asyncio-subprocess.html#asyncio.asyncio.subprocess.Process.wait
    #
    # Should this start hanging, the fallback is the code below. Its drawback is that nothing is shown until the
    # process is done, so a hang would leave no debug info.
    # out, err = await p.communicate()
    # return _RunOutput(p.returncode, out, err)

    stdout_lines = []
    stderr_lines = []

    def tee(line, sink, pipe, label=""):
        # Keep the decoded line and, unless silenced, mirror it to the given pipe.
        text = line.decode("utf-8").rstrip()
        sink.append(text)
        if not quiet:
            print(label, text, file=pipe)

    stdout_reader = asyncio.create_task(
        _read_stream(process.stdout, lambda l: tee(l, stdout_lines, sys.stdout, label="stdout:"))
    )
    stderr_reader = asyncio.create_task(
        _read_stream(process.stderr, lambda l: tee(l, stderr_lines, sys.stderr, label="stderr:"))
    )

    # XXX: the timeout doesn't seem to make any difference here
    await asyncio.wait([stdout_reader, stderr_reader], timeout=timeout)
    returncode = await process.wait()
    return _RunOutput(returncode, stdout_lines, stderr_lines)


def execute_subprocess_async(cmd: list, env=None, stdin=None, timeout=180, quiet=False, echo=True) -> _RunOutput:
    """
    Run `cmd` in a subprocess through `_stream_subprocess` and return its captured output.

    Args:
        cmd (`list`): Program followed by its arguments. `Path` items are converted to strings in place.
        env: Environment handed to the subprocess.
        stdin: Value forwarded as the subprocess `stdin`.
        timeout: Forwarded to `_stream_subprocess`, which uses it while reading the output streams.
        quiet: When true, captured lines are not printed.
        echo: When true, the command line is printed before launching.

    Returns:
        `_RunOutput`: The return code and the captured stdout/stderr lines.

    Raises:
        `RuntimeError`: If the subprocess finishes with a non-zero return code.
    """
    # Cast every path in `cmd` to a string
    for i, c in enumerate(cmd):
        if isinstance(c, Path):
            cmd[i] = str(c)

    result = asyncio.run(_stream_subprocess(cmd, env=env, stdin=stdin, timeout=timeout, quiet=quiet, echo=echo))

    # Any non-zero status is a failure. This includes negative return codes, which asyncio reports when the child
    # was terminated by a signal (e.g. a crash); those must not be mistaken for a successful run.
    if result.returncode != 0:
        cmd_str = " ".join(cmd)
        stderr = "\n".join(result.stderr)
        raise RuntimeError(
            f"'{cmd_str}' failed with returncode {result.returncode}\n\n"
            f"The combined stderr from workers follows:\n{stderr}"
        )

    return result


def pytest_xdist_worker_id():
    """
    Returns an int value of worker's numerical id under `pytest-xdist`'s concurrent workers `pytest -n N` regime, or 0
    if `-n 1` or `pytest-xdist` isn't being used.

    The id is read from the `PYTEST_XDIST_WORKER` environment variable (`"gw0"` when unset) with its leading `gw`
    stripped.
    """
    worker = os.environ.get("PYTEST_XDIST_WORKER", "gw0")
    # `count` and `flags` are passed by keyword: giving them positionally is deprecated as of Python 3.13.
    worker = re.sub(r"^gw", "", worker, count=0, flags=re.M)
    return int(worker)


def get_torch_dist_unique_port():
    """
    Returns a port number suitable for `torch.distributed.launch`'s `--master_port` argument.

    The `pytest-xdist` worker id is added to the base port 29500, so concurrent workers don't try to use the same
    port at once.
    """
    base_port = 29500
    return base_port + pytest_xdist_worker_id()


class SubprocessCallException(Exception):
    "Raised by `run_command` when the executed command fails"


def run_command(command: list[str], return_stdout=False, env=None):
    """
    Execute `command` via `subprocess.check_output`, with stderr merged into stdout.

    Args:
        command (`list[str]`): Program and arguments. `Path` items are converted to strings in place.
        return_stdout (`bool`): When true, the captured output is returned (decoded as UTF-8 when it can be).
        env: Environment for the subprocess; a copy of `os.environ` is used when `None`.

    Returns:
        The captured output when `return_stdout` is true, otherwise `None`.

    Raises:
        `SubprocessCallException`: If the command exits with a non-zero status; the message contains its output.
    """
    # Stringify any `Path` entry of `command`
    for position, part in enumerate(command):
        if isinstance(part, Path):
            command[position] = str(part)

    environment = os.environ.copy() if env is None else env
    try:
        captured = subprocess.check_output(command, stderr=subprocess.STDOUT, env=environment)
    except subprocess.CalledProcessError as e:
        raise SubprocessCallException(
            f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}"
        ) from e

    if not return_stdout:
        return None
    return captured.decode("utf-8") if hasattr(captured, "decode") else captured


def path_in_accelerate_package(*components: str) -> Path:
    """
    Build a path located inside the directory of the `accelerate` package.

    Args:
        *components: Path components appended, in order, to the package directory.

    Returns:
        `Path`: The package directory joined with `components`.
    """
    # The package directory is the parent of the file `accelerate` was loaded from.
    return Path(inspect.getfile(accelerate)).parent.joinpath(*components)


@contextmanager
def assert_exception(exception_class: Exception, msg: Optional[str] = None) -> bool:
    """
    Context manager asserting that its body raises an exception of type `exception_class`.

    When `msg` is given, it must also be contained in the string form of the raised exception. An `AssertionError`
    is raised if the body finishes without raising.
    """
    try:
        yield
    except Exception as raised:
        assert isinstance(raised, exception_class), (
            f"Expected exception of type {exception_class} but got {type(raised)}"
        )
        if msg is not None:
            assert msg in str(raised), f"Expected message '{msg}' to be in exception but got '{str(raised)}'"
    else:
        # The body completed normally, so the expected exception never happened.
        raise AssertionError(f"Expected exception of type {exception_class} but ran without issue.")


def capture_call_output(func, *args, **kwargs):
    """
    Call `func(*args, **kwargs)` with `sys.stdout` temporarily redirected and return what was written, as a string.

    The previous `sys.stdout` is restored even when `func` raises; the exception then propagates to the caller.
    """
    buffer = io.StringIO()
    previous_stdout = sys.stdout
    sys.stdout = buffer
    try:
        func(*args, **kwargs)
    finally:
        sys.stdout = previous_stdout
    return buffer.getvalue()


================================================
FILE: src/accelerate/test_utils/training.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import torch
from torch.utils.data import DataLoader

from accelerate.utils.dataclasses import DistributedType


class RegressionDataset:
    """
    Synthetic regression data following `y = a * x + b` plus noise.

    `x` holds `length` float32 samples drawn from a normal distribution, and `y` adds normal noise with scale 0.1
    to `a * x + b`. Passing `seed` makes the data reproducible.
    """

    def __init__(self, a=2, b=3, length=64, seed=None):
        generator = np.random.default_rng(seed)
        self.length = length
        # The inputs are drawn before the noise; the draw order determines the generated values.
        self.x = generator.normal(size=(length,)).astype(np.float32)
        noise = generator.normal(scale=0.1, size=(length,)).astype(np.float32)
        self.y = a * self.x + b + noise

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        sample = {"x": self.x[i], "y": self.y[i]}
        return sample


class RegressionModel(torch.nn.Module):
    """
    Minimal linear model computing `x * a + b` with two scalar parameters.

    `double_output` is accepted by the constructor but is not used by it. On the first forward call the parameter
    and input dtypes are printed once.
    """

    def __init__(self, a=0, b=0, double_output=False):
        super().__init__()
        slope = torch.tensor(a).float()
        intercept = torch.tensor(b).float()
        self.a = torch.nn.Parameter(slope)
        self.b = torch.nn.Parameter(intercept)
        # Cleared after the first forward pass so that the dtype report is printed only once.
        self.first_batch = True

    def forward(self, x=None):
        if self.first_batch:
            print(f"Model dtype: {self.a.dtype}, {self.b.dtype}. Input dtype: {x.dtype}")
            self.first_batch = False
        scaled = x * self.a
        return scaled + self.b


def mocked_dataloaders(accelerator, batch_size: int = 16):
    """
    Build train and eval `DataLoader`s over the MRPC sample CSV files, tokenized with `bert-base-cased`.

    Args:
        accelerator: Object whose `distributed_type` is consulted when collating batches.
        batch_size (`int`): Accepted but not read by this function; the loaders use fixed batch sizes of 2 (train)
            and 1 (eval).

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    from datasets import load_dataset
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    data_files = {"train": "tests/test_samples/MRPC/train.csv", "validation": "tests/test_samples/MRPC/dev.csv"}
    datasets = load_dataset("csv", data_files=data_files)

    # Map every distinct label of the train split to an integer id.
    label_to_id = {label: index for index, label in enumerate(datasets["train"].unique("label"))}

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        outputs = tokenizer(
            examples["sentence1"], examples["sentence2"], truncation=True, max_length=None, padding="max_length"
        )
        if "label" in examples:
            outputs["labels"] = [label_to_id[l] for l in examples["label"]]
        return outputs

    # Tokenize all examples of all splits, dropping the raw text and label columns
    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=["sentence1", "sentence2", "label"],
    )

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
        if accelerator.distributed_type == DistributedType.XLA:
            padding_options = {"padding": "max_length", "max_length": 128}
        else:
            padding_options = {"padding": "longest"}
        return tokenizer.pad(examples, return_tensors="pt", **padding_options)

    # Instantiate dataloaders.
    train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=2)
    eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=1)
    return train_dataloader, eval_dataloader


def mocked_dataloaders_for_autoregressive_models(accelerator, batch_size: int = 16):
    """
    Build train and eval `DataLoader`s for next-token prediction over the MRPC sample CSV files, tokenized with the
    `HuggingFaceTB/SmolLM-360M` tokenizer.

    Each batch is padded one token longer than needed, then split into `input_ids` (all but the last token) and
    `labels` (all but the first token), with padding positions in `labels` replaced by -100.

    Args:
        accelerator: Object providing `main_process_first()`, `distributed_type` and `mixed_precision`.
        batch_size (`int`): Accepted but not read by this function; the loaders use fixed batch sizes of 2 (train)
            and 1 (eval).

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    from datasets import load_dataset
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-360M")
    # The end-of-sequence token doubles as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    data_files = {"train": "tests/test_samples/MRPC/train.csv", "validation": "tests/test_samples/MRPC/dev.csv"}
    datasets = load_dataset("csv", data_files=data_files)

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        return tokenizer(examples["sentence1"], truncation=True, max_length=None, return_attention_mask=False)

    # Tokenize all examples of all splits, letting the main process go first
    with accelerator.main_process_first():
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["sentence1", "sentence2", "label"],
        )

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
        if accelerator.distributed_type == DistributedType.XLA:
            max_length = 128
        else:
            max_length = max(len(example["input_ids"]) for example in examples)

        # When using mixed precision we want round multiples of 8/16
        precision = accelerator.mixed_precision
        if precision == "fp8":
            pad_to_multiple_of = 16
        elif precision == "no":
            pad_to_multiple_of = None
        else:
            pad_to_multiple_of = 8

        # One extra position is padded so that inputs and labels can be shifted against each other below.
        batch = tokenizer.pad(
            examples,
            padding="max_length",
            max_length=max_length + 1,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors="pt",
        )

        padded_ids = batch["input_ids"]
        shifted_labels = padded_ids[:, 1:]
        batch["labels"] = shifted_labels
        batch["input_ids"] = padded_ids[:, :-1]
        if "attention_mask" in batch:
            batch["attention_mask"] = batch["attention_mask"][:, :-1]

        # Padding positions are replaced by -100 in the labels.
        batch["labels"] = torch.where(batch["labels"] == tokenizer.pad_token_id, -100, batch["labels"])
        return batch

    # Instantiate dataloaders.
    train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=False, collate_fn=collate_fn, batch_size=2)
    eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=1)
    return train_dataloader, eval_dataloader


================================================
FILE: src/accelerate/tracking.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Expectation:
# Provide a project dir name, then each type of logger gets stored in project/{`logging_dir`}

import json
import os
import time
from functools import wraps
from typing import Any, Optional, Union

import yaml
from packaging import version

from .logging import get_logger
from .state import PartialState
from .utils import (
    LoggerType,
    compare_versions,
    is_aim_available,
    is_clearml_available,
    is_comet_ml_available,
    is_dvclive_available,
    is_mlflow_available,
    is_swanlab_available,
    is_tensorboard_available,
    is_trackio_available,
    is_wandb_available,
    listify,
)


# Logger types whose backing library is reported available, probed once at import time in this fixed order.
_available_trackers = [
    logger_type
    for is_available, logger_type in (
        (is_tensorboard_available, LoggerType.TENSORBOARD),
        (is_wandb_available, LoggerType.WANDB),
        (is_comet_ml_available, LoggerType.COMETML),
        (is_aim_available, LoggerType.AIM),
        (is_mlflow_available, LoggerType.MLFLOW),
        (is_clearml_available, LoggerType.CLEARML),
        (is_dvclive_available, LoggerType.DVCLIVE),
        (is_swanlab_available, LoggerType.SWANLAB),
        (is_trackio_available, LoggerType.TRACKIO),
    )
    if is_available()
]

logger = get_logger(__name__)


def on_main_process(function):
    """
    Decorator for methods that should run on the main process only when the instance's `main_process_only`
    attribute is truthy.

    The attribute is looked up each time the method is called rather than at decoration time, so decorating does
    not trigger the initialization of the `PartialState`.
    """

    @wraps(function)
    def main_process_gate(self, *args, **kwargs):
        # Without a truthy `main_process_only` the method runs directly and `PartialState` is never touched.
        if not getattr(self, "main_process_only", False):
            return function(self, *args, **kwargs)
        return PartialState().on_main_process(function)(self, *args, **kwargs)

    return main_process_gate


def get_available_trackers():
    """
    Returns the module-level list of all supported trackers found available in the system.

    The list object itself is returned, not a copy.
    """
    return _available_trackers


class GeneralTracker:
    """
    Base class that every logging integration (tracker) should derive from.

    Each function should accept `**kwargs`, which are automatically passed in from a base dictionary provided to
    [`Accelerator`].

    Subclasses must provide the following:

    - `name` (`str`): String representation of the tracker class name, such as "TensorBoard".
    - `requires_logging_directory` (`bool`): Whether the logger requires a directory to store their logs.
    - `tracker` (`object`): Should return the internal tracking mechanism used by a tracker class (such as the
      `run` for wandb).

    Subclasses may also set a `main_process_only` (`bool`) attribute to toggle if relevant logging, init, and other
    functions should occur on the main process or across all processes (by default will use `True`).
    """

    main_process_only = True

    def __init__(self, _blank=False):
        # `_blank=True` skips the validation of the required attributes altogether.
        if _blank:
            return

        missing = []
        if not hasattr(self, "name"):
            missing.append("`name`")
        if not hasattr(self, "requires_logging_directory"):
            missing.append("`requires_logging_directory`")
        # `tracker` is a @property that relies on post-init, so only its presence is checked via `dir`
        if "tracker" not in dir(self):
            missing.append("`tracker`")

        if missing:
            raise NotImplementedError(
                f"The implementation for this tracker class is missing the following "
                f"required attributes. Please define them in the class definition: "
                f"{', '.join(missing)}"
            )

    def start(self):
        """
        Lazy initialization hook, called inside Accelerator so that PartialState is not initialized before
        InitProcessGroupKwargs. Does nothing in the base class.
        """

    def store_init_configuration(self, values: dict):
        """
        Logs `values` as hyperparameters for the run. Implementations should rely on the experiment configuration
        functionality of their tracking API. Does nothing in the base class.

        Args:
            values (Dictionary `str` to `bool`, `str`, `float` or `int`):
                Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
                `str`, `float`, `int`, or `None`.
        """

    def log(self, values: dict, step: Optional[int], **kwargs):
        """
        Logs `values` to the current run. The base `log` call of a tracking API belongs here, along with any special
        behavior for the `step` parameter. Does nothing in the base class.

        Args:
            values (Dictionary `str` to `str`, `float`, or `int`):
                Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
        """

    def finish(self):
        """
        Should run any finalizing functions of the tracking API. Trackers whose API has none can simply leave this
        method as it is. Does nothing in the base class.
        """


class TensorBoardTracker(GeneralTracker):
    """
    A `Tracker` class that supports `tensorboard`. Should be initialized at the start of your script.

    Args:
        run_name (`str`):
            The name of the experiment run. It is joined onto `logging_dir` to form the directory the writer uses.
        logging_dir (`str`, `os.PathLike`):
            Location for TensorBoard logs to be stored.
        **kwargs (additional keyword arguments, *optional*):
            Additional key word arguments passed along to the `tensorboard.SummaryWriter.__init__` method.
    """

    name = "tensorboard"
    requires_logging_directory = True

    def __init__(self, run_name: str, logging_dir: Union[str, os.PathLike], **kwargs):
        super().__init__()
        self.run_name = run_name
        self.logging_dir_param = logging_dir
        self.init_kwargs = kwargs

    @on_main_process
    def start(self):
        """
        Creates the `SummaryWriter` that every other method of this tracker writes through.
        """
        # Prefer the implementation bundled with torch; fall back to the standalone `tensorboardX` package.
        try:
            from torch.utils import tensorboard
        except ModuleNotFoundError:
            import tensorboardX as tensorboard
        self.logging_dir = os.path.join(self.logging_dir_param, self.run_name)
        self.writer = tensorboard.SummaryWriter(self.logging_dir, **self.init_kwargs)
        logger.debug(f"Initialized TensorBoard project {self.run_name} logging to {self.logging_dir}")
        logger.debug(
            "Make sure to log any initial configurations with `self.store_init_configuration` before training!"
        )

    @property
    def tracker(self):
        """
        The `SummaryWriter` created by `start`.
        """
        return self.writer

    @on_main_process
    def store_init_configuration(self, values: dict):
        """
        Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment. Stores the
        hyperparameters in a yaml file for future use.

        The yaml file is written as `hparams.yml` inside a sub-directory of the logging directory named after the
        current timestamp.

        Args:
            values (Dictionary `str` to `bool`, `str`, `float` or `int`):
                Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
                `str`, `float`, `int`, or `None`.

        Raises:
            `yaml.representer.RepresenterError`: If `values` cannot be serialized to yaml. The error is logged and
            re-raised.
        """
        self.writer.add_hparams(values, metric_dict={})
        self.writer.flush()
        project_run_name = time.time()
        dir_name = os.path.join(self.logging_dir, str(project_run_name))
        os.makedirs(dir_name, exist_ok=True)
        with open(os.path.join(dir_name, "hparams.yml"), "w") as outfile:
            try:
                yaml.dump(values, outfile)
            except yaml.representer.RepresenterError:
                logger.error("Serialization to store hyperparameters failed")
                raise
        logger.debug("Stored initial configuration hyperparameters to TensorBoard and hparams yaml file")

    @on_main_process
    def log(self, values: dict, step: Optional[int] = None, **kwargs):
        """
        Logs `values` to the current run.

        Args:
            values (Dictionary `str` to `str`, `float`, `int` or `dict` of `str` to `float`/`int`):
                Values to be logged as key-value pairs. The values need to have type `str`, `float`, `int` or `dict` of
                `str` to `float`/`int`. Values of any other type are skipped.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            kwargs:
                Additional key word arguments passed along to either `SummaryWriter.add_scalar`,
                `SummaryWriter.add_text`, or `SummaryWriter.add_scalars` method based on the contents of `values`.
        """
        values = listify(values)
        for k, v in values.items():
            if isinstance(v, (int, float)):
                self.writer.add_scalar(k, v, global_step=step, **kwargs)
            elif isinstance(v, str):
                self.writer.add_text(k, v, global_step=step, **kwargs)
            elif isinstance(v, dict):
                self.writer.add_scalars(k, v, global_step=step, **kwargs)
        self.writer.flush()
        logger.debug("Successfully logged to TensorBoard")

    @on_main_process
    def log_images(self, values: dict, step: Optional[int] = None, **kwargs):
        """
        Logs `images` to the current run.

        Args:
            values (Dictionary `str` to `List` of `np.ndarray` or `PIL.Image`):
                Values to be logged as key-value pairs. The values need to have type `List` of `np.ndarray` or
                `PIL.Image`.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            kwargs:
                Additional key word arguments passed along to the `SummaryWriter.add_images` method.
        """
        for k, v in values.items():
            self.writer.add_images(k, v, global_step=step, **kwargs)
        logger.debug("Successfully logged images to TensorBoard")

    @on_main_process
    def finish(self):
        """
        Closes `TensorBoard` writer
        """
        self.writer.close()
        logger.debug("TensorBoard writer closed")


class WandBTracker(GeneralTracker):
    """
    Tracker implementation backed by `wandb`. Should be initialized at the start of your script.

    Args:
        run_name (`str`):
            The name of the experiment run. It is handed to `wandb.init` as the `project`.
        **kwargs (additional keyword arguments, *optional*):
            Additional key word arguments passed along to the `wandb.init` method.
    """

    name = "wandb"
    requires_logging_directory = False
    main_process_only = False

    def __init__(self, run_name: str, **kwargs):
        super().__init__()
        self.init_kwargs = kwargs
        self.run_name = run_name

    @on_main_process
    def start(self):
        """
        Opens the wandb run and keeps a handle to it on `self.run`.
        """
        import wandb

        run = wandb.init(project=self.run_name, **self.init_kwargs)
        self.run = run
        logger.debug(f"Initialized WandB project {self.run_name}")
        logger.debug(
            "Make sure to log any initial configurations with `self.store_init_configuration` before training!"
        )

    @property
    def tracker(self):
        """
        The run object returned by `wandb.init`.
        """
        return self.run

    @on_main_process
    def store_init_configuration(self, values: dict):
        """
        Records `values` as the hyperparameters of the run. Should be run at the beginning of your experiment.

        Args:
            values (Dictionary `str` to `bool`, `str`, `float` or `int`):
                Hyperparameters to store, as key-value pairs. The values need to have type `bool`, `str`, `float`,
                `int`, or `None`.
        """
        import wandb

        run_config = wandb.config
        run_config.update(values, allow_val_change=True)
        logger.debug("Stored initial configuration hyperparameters to WandB")

    @on_main_process
    def log(self, values: dict, step: Optional[int] = None, **kwargs):
        """
        Sends `values` to the current run.

        Args:
            values (Dictionary `str` to `str`, `float`, `int` or `dict` of `str` to `float`/`int`):
                Key-value pairs to log. The values need to have type `str`, `float`, `int` or `dict` of `str` to
                `float`/`int`.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            kwargs:
                Additional key word arguments passed along to the `wandb.log` method.
        """
        active_run = self.run
        active_run.log(values, step=step, **kwargs)
        logger.debug("Successfully logged to WandB")

    @on_main_process
    def log_images(self, values: dict, step: Optional[int] = None, **kwargs):
        """
        Sends images to the current run, one `log` call per key.

        Args:
            values (Dictionary `str` to `List` of `np.ndarray` or `PIL.Image`):
                Key-value pairs to log. Each value is a `List` of images; every image is wrapped in `wandb.Image`.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            kwargs:
                Additional key word arguments passed along to the `wandb.log` method.
        """
        import wandb

        for label, images in values.items():
            wrapped_images = [wandb.Image(picture) for picture in images]
            self.log({label: wrapped_images}, step=step, **kwargs)
        logger.debug("Successfully logged images to WandB")

    @on_main_process
    def log_table(
        self,
        table_name: str,
        columns: Optional[list[str]] = None,
        data: Optional[list[list[Any]]] = None,
        dataframe: Any = None,
        step: Optional[int] = None,
        **kwargs,
    ):
        """
        Log a Table containing any object type (text, image, audio, video, molecule, html, etc). The table is built
        either from `columns` and `data` or from `dataframe`.

        Args:
            table_name (`str`):
                The name to give to the logged table on the wandb workspace
            columns (list of `str`, *optional*):
                The name of the columns on the table
            data (List of List of Any data type, *optional*):
                The data to be logged in the table
            dataframe (Any data type, *optional*):
                The data to be logged in the table
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            kwargs:
                Additional key word arguments forwarded to this tracker's `log` method.
        """
        import wandb

        table = wandb.Table(columns=columns, data=data, dataframe=dataframe)
        self.log({table_name: table}, step=step, **kwargs)

    @on_main_process
    def finish(self):
        """
        Finishes the `wandb` run.
        """
        active_run = self.run
        active_run.finish()
        logger.debug("WandB run closed")


class TrackioTracker(GeneralTracker):
    """
    Tracker implementation backed by `trackio`. Should be initialized at the start of your script.

    Args:
        run_name (`str`):
            The name of the experiment run. Will be used as the `project` name when instantiating trackio.
        **kwargs (additional keyword arguments, *optional*):
            Additional key word arguments passed along to the `trackio.init` method. Refer to this
            [init](https://github.com/gradio-app/trackio/blob/814809552310468b13f84f33764f1369b4e5136c/trackio/__init__.py#L22)
            to see all supported key word arguments.
    """

    name = "trackio"
    requires_logging_directory = False
    main_process_only = False

    def __init__(self, run_name: str, **kwargs):
        super().__init__()
        self.init_kwargs = kwargs
        self.run_name = run_name

    @on_main_process
    def start(self):
        """
        Opens the trackio run and keeps a handle to it on `self.run`.
        """
        import trackio

        run = trackio.init(project=self.run_name, **self.init_kwargs)
        self.run = run
        logger.debug(f"Initialized trackio project {self.run_name}")
        logger.debug(
            "Make sure to log any initial configurations with `self.store_init_configuration` before training!"
        )

    @property
    def tracker(self):
        """
        The run object returned by `trackio.init`.
        """
        return self.run

    @on_main_process
    def store_init_configuration(self, values: dict):
        """
        Records `values` as the hyperparameters of the run. Should be run at the beginning of your experiment.

        Args:
            values (Dictionary `str` to `bool`, `str`, `float` or `int`):
                Hyperparameters to store, as key-value pairs. The values need to have type `bool`, `str`, `float`,
                `int`, or `None`.
        """
        import trackio

        run_config = trackio.config
        run_config.update(values, allow_val_change=True)
        logger.debug("Stored initial configuration hyperparameters to trackio")

    @on_main_process
    def log(self, values: dict, step: Optional[int] = None, **kwargs):
        """
        Sends `values` to the current run.

        Args:
            values (Dictionary `str` to `str`, `float`, `int` or `dict` of `str` to `float`/`int`):
                Key-value pairs to log. The values need to have type `str`, `float`, `int` or `dict` of `str` to
                `float`/`int`.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            kwargs:
                Additional key word arguments passed along to the `trackio.log` method.
        """
        active_run = self.run
        active_run.log(values, step=step, **kwargs)
        logger.debug("Successfully logged to trackio")

    @on_main_process
    def finish(self):
        """
        Finishes the `trackio` run.
        """
        active_run = self.run
        active_run.finish()
        logger.debug("trackio run closed")


class CometMLTracker(GeneralTracker):
    """
    Tracker implementation backed by `comet_ml`. Should be initialized at the start of your script.

    API keys must be stored in a Comet config file.

    Note:
        For `comet_ml` versions < 3.41.0, additional keyword arguments are passed to `comet_ml.Experiment` instead:
        https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment/#comet_ml.Experiment.__init__

    Args:
        run_name (`str`):
            The name of the experiment run. It is passed to Comet as the `project_name`.
        **kwargs (additional keyword arguments, *optional*):
            Additional key word arguments passed along to the `comet_ml.start` method:
            https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/start/
    """

    name = "comet_ml"
    requires_logging_directory = False

    def __init__(self, run_name: str, **kwargs):
        super().__init__()
        self.init_kwargs = kwargs
        self.run_name = run_name

    @on_main_process
    def start(self):
        """
        Creates the Comet experiment and keeps it on `self.writer`, choosing the entry point from the installed
        `comet_ml` version.
        """
        import comet_ml

        installed_version = version.parse(comet_ml.__version__)
        has_start_api = compare_versions(installed_version, ">=", "3.41.0")
        if not has_start_api:
            # Older releases only offer the `Experiment` constructor.
            logger.info("Update `comet_ml` (>=3.41.0) for experiment reuse and offline support.")
            self.writer = comet_ml.Experiment(project_name=self.run_name, **self.init_kwargs)
        else:
            self.writer = comet_ml.start(project_name=self.run_name, **self.init_kwargs)

        logger.debug(f"Initialized CometML project {self.run_name}")
        logger.debug(
            "Make sure to log any initial configurations with `self.store_init_configuration` before training!"
        )

    @property
    def tracker(self):
        """
        The experiment object created by `start`.
        """
        return self.writer

    @on_main_process
    def store_init_configuration(self, values: dict):
        """
        Records `values` as the hyperparameters of the run. Should be run at the beginning of your experiment.

        Args:
            values (Dictionary `str` to `bool`, `str`, `float` or `int`):
                Hyperparameters to store, as key-value pairs. The values need to have type `bool`, `str`, `float`,
                `int`, or `None`.
        """
        experiment = self.writer
        experiment.log_parameters(values)
        logger.debug("Stored initial configuration hyperparameters to Comet")

    @on_main_process
    def log(self, values: dict, step: Optional[int] = None, **kwargs):
        """
        Sends `values` to the current run.

        Args:
            values (Dictionary `str` to `str`, `float`, `int` or `dict` of `str` to `float`/`int`):
                Key-value pairs to log. The values need to have type `str`, `float`, `int` or `dict` of `str` to
                `float`/`int`. Values of any other type are skipped.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            kwargs:
                Additional key word arguments passed along to either `Experiment.log_metric`, `Experiment.log_other`,
                or `Experiment.log_metrics` method based on the contents of `values`.
        """
        experiment = self.writer
        if step is not None:
            experiment.set_step(step)
        for label, entry in values.items():
            if isinstance(entry, (int, float)):
                experiment.log_metric(label, entry, step=step, **kwargs)
                continue
            if isinstance(entry, str):
                experiment.log_other(label, entry, **kwargs)
                continue
            if isinstance(entry, dict):
                experiment.log_metrics(entry, step=step, **kwargs)
        logger.debug("Successfully logged to Comet")

    @on_main_process
    def finish(self):
        """
        Ends the `comet-ml` experiment by calling `end()` on the writer.
        """
        experiment = self.writer
        experiment.end()
        logger.debug("Comet run flushed")


class AimTracker(GeneralTracker):
    """
    A `Tracker` class that supports `aim`. Should be initialized at the start of your script.

    Args:
        run_name (`str`):
            The name of the experiment run.
        logging_dir (`str` or `os.PathLike`, *optional*, defaults to `"."`):
            Location of the Aim repository. It is passed to `Run` as the `repo` argument.
        **kwargs (additional keyword arguments, *optional*):
            Additional key word arguments passed along to the `Run.__init__` method.
    """

    name = "aim"
    requires_logging_directory = True

    def __init__(self, run_name: str, logging_dir: Optional[Union[str, os.PathLike]] = ".", **kwargs):
        super().__init__()
        self.run_name = run_name
        self.aim_repo_path = logging_dir
        self.init_kwargs = kwargs

    @on_main_process
    def start(self):
        """
        Creates the Aim `Run`, names it after `run_name` and keeps it on `self.writer`.
        """
        from aim import Run

        self.writer = Run(repo=self.aim_repo_path, **self.init_kwargs)
        self.writer.name = self.run_name
        logger.debug(f"Initialized Aim project {self.run_name}")
        logger.debug(
            "Make sure to log any initial configurations with `self.store_init_configuration` before training!"
        )

    @property
    def tracker(self):
        """
        The Aim `Run` created by `start`.
        """
        return self.writer

    @on_main_process
    def store_init_configuration(self, values: dict):
        """
        Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.

        Args:
            values (`dict`):
                Values to be stored as initial hyperparameters as key-value pairs.
        """
        self.writer["hparams"] = values

    @on_main_process
    def log(self, values: dict, step: Optional[int] = None, **kwargs):
        """
        Logs `values` to the current run.

        Args:
            values (`dict`):
                Values to be logged as key-value pairs.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            kwargs:
                Additional key word arguments passed along to the `Run.track` method.
        """
        # Note: replace this with the dictionary support when merged
        for key, value in values.items():
            self.writer.track(value, name=key, step=step, **kwargs)

    @on_main_process
    def log_images(self, values: dict, step: Optional[int] = None, kwargs: Optional[dict[str, dict]] = None):
        """
        Logs `images` to the current run.

        Args:
            values (`Dict[str, Union[np.ndarray, PIL.Image, Tuple[np.ndarray, str], Tuple[PIL.Image, str]]]`):
                Values to be logged as key-value pairs. The values need to have type `np.ndarray` or PIL.Image. If a
                tuple is provided, the first element should be the image and the second element should be the caption.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            kwargs (`Dict[str, dict]`, *optional*):
                Additional key word arguments passed along to the `aim.Image` constructor and the `Run.track` method,
                specified by the keys `aim_image` and `track`, respectively.
        """
        import aim

        aim_image_kw = {}
        track_kw = {}

        if kwargs is not None:
            aim_image_kw = kwargs.get("aim_image", {})
            track_kw = kwargs.get("track", {})

        for key, value in values.items():
            # A tuple carries (image, caption); a bare image gets an empty caption.
            if isinstance(value, tuple):
                img, caption = value
            else:
                img, caption = value, ""
            aim_image = aim.Image(img, caption=caption, **aim_image_kw)
            self.writer.track(aim_image, name=key, step=step, **track_kw)

    @on_main_process
    def finish(self):
        """
        Closes `aim` writer
        """
        self.writer.close()


class MLflowTracker(GeneralTracker):
    """
    A `Tracker` class that supports `mlflow`. Should be initialized at the start of your script.

    Args:
        experiment_name (`str`, *optional*):
            Name of the experiment. Environment variable MLFLOW_EXPERIMENT_NAME has priority over this argument.
        logging_dir (`str` or `os.PathLike`, *optional*):
            Location for mlflow logs to be stored. It is used as the artifact location when the experiment has to be
            created.
        run_id (`str`, *optional*):
            If specified, get the run with the specified UUID and log parameters and metrics under that run. The run’s
            end time is unset and its status is set to running, but the run’s other attributes (source_version,
            source_type, etc.) are not changed. Environment variable MLFLOW_RUN_ID has priority over this argument.
        tags (`Dict[str, str]`, *optional*):
            An optional `dict` of `str` keys and values, or a `str` dump from a `dict`, to set as tags on the run. If a
            run is being resumed, these tags are set on the resumed run. If a new run is being created, these tags are
            set on the new run. Environment variable MLFLOW_TAGS has priority over this argument.
        nested_run (`bool`, *optional*, defaults to `False`):
            Controls whether run is nested in parent run. True creates a nested run. Environment variable
            MLFLOW_NESTED_RUN has priority over this argument; the values `""`, `"0"`, `"false"`, `"no"`, `"off"`,
            `"n"` and `"f"` (case-insensitive) disable nesting, any other value enables it.
        run_name (`str`, *optional*):
            Name of new run (stored as a mlflow.runName tag). Used only when `run_id` is unspecified.
        description (`str`, *optional*):
            An optional string that populates the description box of the run. If a run is being resumed, the
            description is set on the resumed run. If a new run is being created, the description is set on the new
            run.
    """

    name = "mlflow"
    requires_logging_directory = False

    def __init__(
        self,
        experiment_name: Optional[str] = None,
        logging_dir: Optional[Union[str, os.PathLike]] = None,
        run_id: Optional[str] = None,
        tags: Optional[Union[dict[str, Any], str]] = None,
        nested_run: Optional[bool] = False,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
    ):
        experiment_name = os.environ.get("MLFLOW_EXPERIMENT_NAME", experiment_name)
        run_id = os.environ.get("MLFLOW_RUN_ID", run_id)
        tags = os.environ.get("MLFLOW_TAGS", tags)
        if isinstance(tags, str):
            tags = json.loads(tags)

        nested_run = os.environ.get("MLFLOW_NESTED_RUN", nested_run)
        if isinstance(nested_run, str):
            # Environment variables are always strings and any non-empty string, including "false" or "0", is
            # truthy. Convert explicitly so that such values really disable nesting.
            nested_run = nested_run.strip().lower() not in ("", "0", "false", "no", "off", "n", "f")

        self.experiment_name = experiment_name
        self.logging_dir = logging_dir
        self.run_id = run_id
        self.tags = tags
        self.nested_run = nested_run
        self.run_name = run_name
        self.description = description

    @on_main_process
    def start(self):
        """
        Looks up the experiment by name, creating it if no match is found, then starts the run and keeps it on
        `self.active_run`.
        """
        import mlflow

        exps = mlflow.search_experiments(filter_string=f"name = '{self.experiment_name}'")
        if len(exps) > 0:
            if len(exps) > 1:
                logger.warning("Multiple experiments with the same name found. Using first one.")
            experiment_id = exps[0].experiment_id
        else:
            experiment_id = mlflow.create_experiment(
                name=self.experiment_name,
                artifact_location=self.logging_dir,
                tags=self.tags,
            )

        self.active_run = mlflow.start_run(
            run_id=self.run_id,
            experiment_id=experiment_id,
            run_name=self.run_name,
            nested=self.nested_run,
            tags=self.tags,
            description=self.description,
        )

        logger.debug(f"Initialized mlflow experiment {self.experiment_name}")
        logger.debug(
            "Make sure to log any initial configurations with `self.store_init_configuration` before training!"
        )

    @property
    def tracker(self):
        """
        The run object returned by `mlflow.start_run`.
        """
        return self.active_run

    @on_main_process
    def store_init_configuration(self, values: dict):
        """
        Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.

        Entries whose string form is longer than MLflow's maximum parameter length are skipped with a warning. The
        `values` dictionary passed in is left untouched.

        Args:
            values (`dict`):
                Values to be stored as initial hyperparameters as key-value pairs.
        """
        import mlflow

        max_value_length = mlflow.utils.validation.MAX_PARAM_VAL_LENGTH
        batch_size = mlflow.utils.validation.MAX_PARAMS_TAGS_PER_BATCH

        # Collect the loggable entries in a new list rather than deleting from `values`: the caller owns that
        # dictionary and may still need the full configuration afterwards.
        loggable = []
        for name, value in values.items():
            # internally, all values are converted to str in MLflow
            if len(str(value)) > max_value_length:
                logger.warning_once(
                    f'Accelerate is attempting to log a value of "{value}" for key "{name}" as a parameter. MLflow\'s'
                    f" log_param() only accepts values no longer than {max_value_length} characters so we dropped this attribute."
                )
                continue
            loggable.append((name, value))

        # MLflow cannot log more than 100 values in one go, so we have to split it
        for i in range(0, len(loggable), batch_size):
            mlflow.log_params(dict(loggable[i : i + batch_size]))

        logger.debug("Stored initial configuration hyperparameters to MLflow")

    @on_main_process
    def log(self, values: dict, step: Optional[int] = None):
        """
        Logs `values` to the current run.

        Args:
            values (`dict`):
                Values to be logged as key-value pairs. Only `int` and `float` values are logged; anything else is
                dropped with a warning.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
        """
        metrics = {}
        for k, v in values.items():
            if isinstance(v, (int, float)):
                metrics[k] = v
            else:
                logger.warning_once(
                    f'MLflowTracker is attempting to log a value of "{v}" of type {type(v)} for key "{k}" as a metric. '
                    "MLflow's log_metric() only accepts float and int types so we dropped this attribute."
                )
        import mlflow

        mlflow.log_metrics(metrics, step=step)
        logger.debug("Successfully logged to mlflow")

    @on_main_process
    def log_figure(self, figure: Any, artifact_file: str, **save_kwargs):
        """
        Logs a figure to the current run.

        Args:
            figure (Any):
                The figure to be logged.
            artifact_file (`str`):
                The run-relative artifact file path in posixpath format to which the figure is saved.
            **save_kwargs:
                Additional keyword arguments passed to the underlying `mlflow.log_figure` function.
        """
        import mlflow

        mlflow.log_figure(figure=figure, artifact_file=artifact_file, **save_kwargs)
        logger.debug("Successfully logged image to mlflow")

    @on_main_process
    def log_artifacts(self, local_dir: str, artifact_path: Optional[str] = None):
        """
        Logs artifacts (all content of a directory) to the current run.

        Args:
            local_dir (`str`):
                Path to the directory to be logged as an artifact.
            artifact_path (`str`, *optional*):
                Directory within the run's artifact directory where the artifact will be logged. If omitted, the
                artifact will be logged to the root of the run's artifact directory.
        """
        import mlflow

        mlflow.log_artifacts(local_dir=local_dir, artifact_path=artifact_path)
        logger.debug("Successfully logged artifact to mlflow")

    @on_main_process
    def log_artifact(self, local_path: str, artifact_path: Optional[str] = None):
        """
        Logs an artifact (file) to the current run.

        Args:
            local_path (`str`):
                Path to the file to be logged as an artifact.
            artifact_path (`str`, *optional*):
                Directory within the run's artifact directory where the artifact will be logged. If omitted, the
                artifact will be logged to the root of the run's artifact directory.
        """
        import mlflow

        mlflow.log_artifact(local_path=local_path, artifact_path=artifact_path)
        logger.debug("Successfully logged artifact to mlflow")

    @on_main_process
    def finish(self):
        """
        End the active MLflow run.
        """
        import mlflow

        mlflow.end_run()


class ClearMLTracker(GeneralTracker):
    """
    Tracker implementation backed by `clearml`. Should be initialized at the start of your script.

    Args:
        run_name (`str`, *optional*):
            Name of the experiment. Environment variables `CLEARML_PROJECT` and `CLEARML_TASK` have priority over this
            argument.
        **kwargs (additional keyword arguments, *optional*):
            Kwargs passed along to the `Task.init` method.
    """

    name = "clearml"
    requires_logging_directory = False

    def __init__(self, run_name: Optional[str] = None, **kwargs):
        super().__init__()
        self.init_kwargs = kwargs
        self._initialized_externally = False
        self.user_provided_run_name = run_name

    @on_main_process
    def start(self):
        """
        Attaches to the ClearML task that is already running, if any; otherwise initializes a new one.
        """
        from clearml import Task

        existing_task = Task.current_task()
        if existing_task:
            # Someone else created the task, so remember not to close it in `finish`.
            self._initialized_externally = True
            self.task = existing_task
            return

        fallback_name = self.user_provided_run_name
        init_args = dict(self.init_kwargs)
        # Explicit kwargs win, then the environment variables, then the user provided run name.
        init_args.setdefault("project_name", os.environ.get("CLEARML_PROJECT", fallback_name))
        init_args.setdefault("task_name", os.environ.get("CLEARML_TASK", fallback_name))
        self.task = Task.init(**init_args)

    @property
    def tracker(self):
        """
        The ClearML task set up by `start`.
        """
        return self.task

    @on_main_process
    def store_init_configuration(self, values: dict):
        """
        Connect configuration dictionary to the Task object. Should be run at the beginning of your experiment.

        Args:
            values (`dict`):
                Hyperparameters to store, as key-value pairs.

        Returns:
            Whatever `Task.connect_configuration` returns for `values`.
        """
        connected = self.task.connect_configuration(values)
        return connected

    @on_main_process
    def log(self, values: dict[str, Union[int, float]], step: Optional[int] = None, **kwargs):
        """
        Logs the `values` dictionary to the current run. The dictionary keys must be strings. The dictionary values
        must be ints or floats; other values are dropped with a warning.

        Args:
            values (`Dict[str, Union[int, float]]`):
                Values to be logged as key-value pairs. If the key starts with 'eval_'/'test_'/'train_', the value will
                be reported under the 'eval'/'test'/'train' series and the respective prefix will be removed.
                Otherwise, the value will be reported under the 'train' series, and no prefix will be removed.
            step (`int`, *optional*):
                If specified, the values will be reported as scalars, with the iteration number equal to `step`.
                Otherwise they will be reported as single values.
            kwargs:
                Additional key word arguments passed along to the `clearml.Logger.report_single_value` or
                `clearml.Logger.report_scalar` methods.
        """
        reporter = self.task.get_logger()
        for k, v in values.items():
            if isinstance(v, (int, float)):
                if step is None:
                    reporter.report_single_value(name=k, value=v, **kwargs)
                else:
                    title, series = ClearMLTracker._get_title_series(k)
                    reporter.report_scalar(title=title, series=series, value=v, iteration=step, **kwargs)
            else:
                logger.warning_once(
                    "Accelerator is attempting to log a value of "
                    f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
                    "This invocation of ClearML logger's  report_scalar() "
                    "is incorrect so we dropped this attribute."
                )

    @on_main_process
    def log_images(self, values: dict, step: Optional[int] = None, **kwargs):
        """
        Sends images to the current run, one report per key.

        Args:
            values (`Dict[str, List[Union[np.ndarray, PIL.Image]]`):
                Key-value pairs to log. Each value is passed as the `image` argument of
                `clearml.Logger.report_image`.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            kwargs:
                Additional key word arguments passed along to the `clearml.Logger.report_image` method.
        """
        reporter = self.task.get_logger()
        for label, image in values.items():
            title, series = ClearMLTracker._get_title_series(label)
            reporter.report_image(title=title, series=series, iteration=step, image=image, **kwargs)

    @on_main_process
    def log_table(
        self,
        table_name: str,
        columns: Optional[list[str]] = None,
        data: Optional[list[list[Any]]] = None,
        dataframe: Any = None,
        step: Optional[int] = None,
        **kwargs,
    ):
        """
        Log a Table to the task. The table is built either from `columns` and `data` or from `dataframe`; when
        `dataframe` is given it takes precedence.

        Args:
            table_name (`str`):
                The name of the table
            columns (list of `str`, *optional*):
                The name of the columns on the table
            data (List of List of Any data type, *optional*):
                The data to be logged in the table. If `columns` is not specified, then the first entry in data will be
                the name of the columns of the table
            dataframe (Any data type, *optional*):
                The data to be logged in the table
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            kwargs:
                Additional key word arguments passed along to the `clearml.Logger.report_table` method.

        Raises:
            `ValueError`: If both `dataframe` and `data` are `None`.
        """
        if dataframe is not None:
            table_plot = dataframe
        elif data is None:
            raise ValueError(
                "`ClearMLTracker.log_table` requires that `data` to be supplied if `dataframe` is `None`"
            )
        elif columns:
            # The header row goes first, followed by the data rows.
            table_plot = [columns] + data
        else:
            table_plot = data
        title, series = ClearMLTracker._get_title_series(table_name)
        reporter = self.task.get_logger()
        reporter.report_table(title=title, series=series, table_plot=table_plot, iteration=step, **kwargs)

    @on_main_process
    def finish(self):
        """
        Close the ClearML task. If the task was initialized externally (e.g. by manually calling `Task.init`), this
        function is a noop
        """
        if not self.task or self._initialized_externally:
            return
        self.task.close()

    @staticmethod
    def _get_title_series(name):
        """
        Splits `name` into a `(title, series)` pair: a leading 'eval_', 'test_' or 'train_' is stripped and becomes
        the series, otherwise `name` is returned unchanged with the 'train' series.
        """
        for series in ("eval", "test", "train"):
            marker = series + "_"
            if name.startswith(marker):
                return name[len(marker) :], series
        return name, "train"


class DVCLiveTracker(GeneralTracker):
    """
    A `Tracker` class that supports `dvclive`. Should be initialized at the start of your script.

    Args:
        run_name (`str`, *optional*):
            Ignored for dvclive. See `kwargs` instead.
        live (`Any`, *optional*):
            An already created live object to log with. When provided, `start` keeps it instead of creating a new
            `dvclive.Live` instance.
        kwargs:
            Additional key word arguments passed along to [`dvclive.Live()`](https://dvc.org/doc/dvclive/live).

    Example:

    ```py
    from accelerate import Accelerator

    accelerator = Accelerator(log_with="dvclive")
    accelerator.init_trackers(project_name="my_project", init_kwargs={"dvclive": {"dir": "my_directory"}})
    ```
    """

    name = "dvclive"
    requires_logging_directory = False

    def __init__(self, run_name: Optional[str] = None, live: Optional[Any] = None, **kwargs):
        super().__init__()
        self.init_kwargs = kwargs
        self.live = live

    @on_main_process
    def start(self):
        """
        Creates the `dvclive.Live` instance from the stored init kwargs, unless one was supplied at construction.
        """
        from dvclive import Live

        if self.live is None:
            self.live = Live(**self.init_kwargs)

    @property
    def tracker(self):
        """
        The underlying live object used for logging.
        """
        return self.live

    @on_main_process
    def store_init_configuration(self, values: dict):
        """
        Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment. Stores the
        hyperparameters in a yaml file for future use.

        Args:
            values (Dictionary `str` to `bool`, `str`, `float`, `int`, or a List or Dict of those types):
                Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
                `str`, `float`, or `int`.
        """
        self.live.log_params(values)

    @on_main_process
    def log(self, values: dict, step: Optional[int] = None, **kwargs):
        """
        Logs `values` to the current run, then advances the live object to its next step.

        Args:
            values (Dictionary `str` to `str`, `float`, or `int`):
                Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`. Values
                that `dvclive` reports it cannot log as a metric are dropped with a one-time warning.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            kwargs:
                Additional key word arguments passed along to `dvclive.Live.log_metric()`.
        """
        from dvclive.plots import Metric

        if step is not None:
            self.live.step = step
        for key, value in values.items():
            if not Metric.could_log(value):
                logger.warning_once(
                    "Accelerator attempted to log a value of "
                    f'"{value}" of type {type(value)} for key "{key}" as a scalar. '
                    "This invocation of DVCLive's Live.log_metric() "
                    "is incorrect so we dropped this attribute."
                )
                continue
            self.live.log_metric(key, value, **kwargs)
        self.live.next_step()

    @on_main_process
    def finish(self):
        """
        Closes `dvclive.Live()`.
        """
        self.live.end()


class SwanLabTracker(GeneralTracker):
    """
    A `Tracker` class that supports `swanlab`. Should be initialized at the start of your script.

    Args:
        run_name (`str`):
            The name of the experiment run. It is forwarded to `swanlab.init` as the `project` argument.
        **kwargs (additional keyword arguments, *optional*):
            Additional key word arguments passed along to the `swanlab.init` method.
    """

    name = "swanlab"
    requires_logging_directory = False
    main_process_only = False

    def __init__(self, run_name: str, **kwargs):
        super().__init__()
        self.init_kwargs = kwargs
        self.run_name = run_name

    @on_main_process
    def start(self):
        """
        Initializes the SwanLab run and tags its config with the framework name.
        """
        import swanlab

        run = swanlab.init(project=self.run_name, **self.init_kwargs)
        self.run = run
        swanlab.config["FRAMEWORK"] = "🤗Accelerate"  # add accelerate logo in config
        logger.debug(f"Initialized SwanLab project {self.run_name}")
        logger.debug(
            "Make sure to log any initial configurations with `self.store_init_configuration` before training!"
        )

    @property
    def tracker(self):
        """
        The run object returned by `swanlab.init`.
        """
        return self.run

    @on_main_process
    def store_init_configuration(self, values: dict):
        """
        Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.

        Args:
            values (Dictionary `str` to `bool`, `str`, `float` or `int`):
                Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
                `str`, `float`, `int`, or `None`.
        """
        import swanlab

        swanlab.config.update(values, allow_val_change=True)
        logger.debug("Stored initial configuration hyperparameters to SwanLab")

    @on_main_process
    def log(self, values: dict, step: Optional[int] = None, **kwargs):
        """
        Logs `values` to the current run.

        Args:
            values (Dictionary `str` to `float`, `int` or `swanlab.data.BaseType`):
                Data to be logged as key-value pairs. Each key must be a string made of 0-9, a-z, A-Z, " ", "_",
                "-", "/". Each value must be a `float`, a float convertible object, an `int` or a
                `swanlab.data.BaseType`.
            step (`int`, *optional*):
                The step number of the current data. If not provided, it will be automatically incremented. If the
                step is duplicated, the data will be ignored.
            kwargs:
                Additional key word arguments passed along to the run's `log` method (see `swanlab.log`), such as:
                    print_to_console : bool, optional
                        Whether to print the data to the console, the default is False.
        """
        self.run.log(values, step=step, **kwargs)
        logger.debug("Successfully logged to SwanLab")

    @on_main_process
    def log_images(self, values: dict, step: Optional[int] = None, **kwargs):
        """
        Logs `images` to the current run. Every image is wrapped in `swanlab.Image` and each key is logged with its
        own call to `log`.

        Args:
            values (Dictionary `str` to `List` of `np.ndarray` or `PIL.Image`):
                Values to be logged as key-value pairs. The values need to have type `List` of `np.ndarray` or
                `PIL.Image`.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
            kwargs:
                Additional key word arguments passed along to the run's `log` method (see `swanlab.log`), such as:
                    print_to_console : bool, optional
                        Whether to print the data to the console, the default is False.
        """
        import swanlab

        for key, images in values.items():
            wrapped = [swanlab.Image(img) for img in images]
            self.log({key: wrapped}, step=step, **kwargs)
        logger.debug("Successfully logged images to SwanLab")

    @on_main_process
    def finish(self):
        """
        Closes `swanlab` writer
        """
        self.run.finish()
        logger.debug("SwanLab run closed")


# Maps each logger type name to the tracker class implementing it. `filter_trackers` looks entries up with
# `str(log_type)` to read the class's `requires_logging_directory` flag.
LOGGER_TYPE_TO_CLASS = {
    "aim": AimTracker,
    "comet_ml": CometMLTracker,
    "mlflow": MLflowTracker,
    "tensorboard": TensorBoardTracker,
    "wandb": WandBTracker,
    "clearml": ClearMLTracker,
    "dvclive": DVCLiveTracker,
    "swanlab": SwanLabTracker,
    "trackio": TrackioTracker,
}


def filter_trackers(
    log_with: list[Union[str, LoggerType, GeneralTracker]],
    logging_dir: Optional[Union[str, os.PathLike]] = None,
):
    """
    Resolves the requested tracker types into the list of trackers that can actually be used.

    Behaviour:
        - A single (non list/tuple) value is treated as a one-element list; `None` yields an empty list.
        - If `all` is in `log_with`, returns the custom `GeneralTracker` instances found in `log_with` followed by
          every tracker available in the environment.
        - Otherwise each entry is validated, repeated tracker types are dropped, tracker types whose package is
          unavailable are skipped (with a debug message), and tracker types that require a `logging_dir` raise if
          `logging_dir` is `None`.

    Args:
        log_with (list of `str`, [`~utils.LoggerType`] or [`~tracking.GeneralTracker`], *optional*):
            A list of loggers to be setup for experiment tracking. Should be one or several of:

            - `"all"`
            - `"tensorboard"`
            - `"wandb"`
            - `"trackio"`
            - `"aim"`
            - `"comet_ml"`
            - `"mlflow"`
            - `"clearml"`
            - `"dvclive"`
            - `"swanlab"`
            If `"all"` is selected, will pick up all available trackers in the environment and initialize them. Can
            also accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
        logging_dir (`str`, `os.PathLike`, *optional*):
            A path to a directory for storing logs of locally-compatible loggers.

    Returns:
        `list`: The selected trackers, as `LoggerType` members and/or `GeneralTracker` instances.

    Raises:
        `ValueError`: If an entry is neither a known logger type nor a `GeneralTracker`, or if a selected tracker
        requires a logging directory and `logging_dir` is `None`.
    """
    if log_with is None:
        return []
    if not isinstance(log_with, (list, tuple)):
        log_with = [log_with]

    if "all" in log_with or LoggerType.ALL in log_with:
        custom_trackers = [entry for entry in log_with if issubclass(type(entry), GeneralTracker)]
        return custom_trackers + get_available_trackers()

    selected = []
    for entry in log_with:
        is_custom = issubclass(type(entry), GeneralTracker)
        if entry not in LoggerType and not is_custom:
            raise ValueError(f"Unsupported logging capability: {entry}. Choose between {LoggerType.list()}")
        if is_custom:
            # Custom tracker instances are passed through as-is.
            selected.append(entry)
            continue

        logger_type = LoggerType(entry)
        if logger_type in selected:
            continue
        if logger_type not in get_available_trackers():
            logger.debug(f"Tried adding logger {logger_type}, but package is unavailable in the system.")
            continue
        tracker_cls = LOGGER_TYPE_TO_CLASS[str(logger_type)]
        if tracker_cls.requires_logging_directory and logging_dir is None:
            raise ValueError(f"Logging with `{logger_type}` requires a `logging_dir` to be passed in.")
        selected.append(logger_type)

    return selected


================================================
FILE: src/accelerate/utils/__init__.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ..parallelism_config import ParallelismConfig
from .ao import convert_model_to_fp8_ao, filter_first_and_last_linear_layers, has_ao_layers
from .constants import (
    MITA_PROFILING_AVAILABLE_PYTORCH_VERSION,
    MODEL_NAME,
    OPTIMIZER_NAME,
    PROFILE_PATTERN_NAME,
    RNG_STATE_NAME,
    SAFE_MODEL_NAME,
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
    SAFE_WEIGHTS_PATTERN_NAME,
    SAMPLER_NAME,
    SCALER_NAME,
    SCHEDULER_NAME,
    TORCH_DISTRIBUTED_OPERATION_TYPES,
    TORCH_LAUNCH_PARAMS,
    WEIGHTS_INDEX_NAME,
    WEIGHTS_NAME,
    WEIGHTS_PATTERN_NAME,
    XPU_PROFILING_AVAILABLE_PYTORCH_VERSION,
)
from .dataclasses import (
    AORecipeKwargs,
    AutocastKwargs,
    BnbQuantizationConfig,
    ComputeEnvironment,
    CustomDtype,
    DataLoaderConfiguration,
    DDPCommunicationHookType,
    DeepSpeedPlugin,
    DeepSpeedSequenceParallelConfig,
    DistributedDataParallelKwargs,
    DistributedType,
    DynamoBackend,
    FP8RecipeKwargs,
    FullyShardedDataParallelPlugin,
    GradientAccumulationPlugin,
    GradScalerKwargs,
    InitProcessGroupKwargs,
    KwargsHandler,
    LoggerType,
    MegatronLMPlugin,
    MSAMPRecipeKwargs,
    PrecisionType,
    ProfileKwargs,
    ProjectConfiguration,
    RNGType,
    SageMakerDistributedType,
    TensorInformation,
    TERecipeKwargs,
    TorchContextParallelConfig,
    TorchDynamoPlugin,
    TorchTensorParallelConfig,
    TorchTensorParallelPlugin,
    add_model_config_to_megatron_parser,
)
from .environment import (
    are_libraries_initialized,
    check_cuda_fp8_capability,
    check_cuda_p2p_ib_support,
    clear_environment,
    convert_dict_to_env_variables,
    get_cpu_distributed_information,
    get_current_device_type,
    get_gpu_info,
    get_int_from_env,
    parse_choice_from_env,
    parse_flag_from_env,
    patch_environment,
    purge_accelerate_environment,
    set_numa_affinity,
    str_to_bool,
)
from .imports import (
    deepspeed_required,
    is_4bit_bnb_available,
    is_8bit_bnb_available,
    is_aim_available,
    is_bf16_available,
    is_bitsandbytes_multi_backend_available,
    is_bnb_available,
    is_boto3_available,
    is_clearml_available,
    is_comet_ml_available,
    is_cuda_available,
    is_datasets_available,
    is_deepspeed_available,
    is_dvclive_available,
    is_fp8_available,
    is_fp16_available,
    is_habana_gaudi1,
    is_hpu_available,
    is_import_timer_available,
    is_lomo_available,
    is_matplotlib_available,
    is_megatron_lm_available,
    is_mlflow_available,
    is_mlu_available,
    is_mps_available,
    is_msamp_available,
    is_musa_available,
    is_neuron_available,
    is_npu_available,
    is_pandas_available,
    is_peft_available,
    is_pippy_available,
    is_pynvml_available,
    is_pytest_available,
    is_rich_available,
    is_sagemaker_available,
    is_schedulefree_available,
    is_sdaa_available,
    is_swanlab_available,
    is_tensorboard_available,
    is_timm_available,
    is_torch_xla_available,
    is_torchao_available,
    is_torchdata_available,
    is_torchdata_stateful_dataloader_available,
    is_torchvision_available,
    is_trackio_available,
    is_transformer_engine_available,
    is_transformer_engine_mxfp8_available,
    is_transformers_available,
    is_triton_available,
    is_wandb_available,
    is_weights_only_available,
    is_xccl_available,
    is_xpu_available,
    torchao_required,
)
from .modeling import (
    align_module_device,
    calculate_maximum_sizes,
    check_device_map,
    check_tied_parameters_in_config,
    check_tied_parameters_on_same_device,
    compute_module_sizes,
    convert_file_size_to_int,
    dtype_byte_size,
    find_tied_parameters,
    get_balanced_memory,
    get_grad_scaler,
    get_max_layer_size,
    get_max_memory,
    get_mixed_precision_context_manager,
    has_offloaded_params,
    id_tensor_storage,
    infer_auto_device_map,
    is_peft_model,
    load_checkpoint_in_model,
    load_offloaded_weights,
    load_state_dict,
    named_module_tensors,
    retie_parameters,
    set_module_tensor_to_device,
)
from .offload import (
    OffloadedWeightsLoader,
    PrefixedDataset,
    extract_submodules_state_dict,
    load_offloaded_weight,
    offload_state_dict,
    offload_weight,
    save_offload_index,
)
from .operations import (
    CannotPadNestedTensorWarning,
    GatheredParameters,
    broadcast,
    broadcast_object_list,
    concatenate,
    convert_outputs_to_fp32,
    convert_to_fp32,
    copy_tensor_to_devices,
    find_batch_size,
    find_device,
    gather,
    gather_object,
    get_data_structure,
    honor_type,
    ignorant_find_batch_size,
    initialize_tensors,
    is_namedtuple,
    is_tensor_information,
    is_torch_tensor,
    listify,
    pad_across_processes,
    pad_input_tensors,
    recursively_apply,
    reduce,
    send_to_device,
    slice_tensors,
)
from .versions import compare_versions, is_torch_version


if is_deepspeed_available():
    from .deepspeed import (
        DeepSpeedEngineWrapper,
        DeepSpeedOptimizerWrapper,
        DeepSpeedSchedulerWrapper,
        DummyOptim,
        DummyScheduler,
        HfDeepSpeedConfig,
        get_active_deepspeed_plugin,
        map_pytorch_optim_to_deepspeed,
    )

from .bnb import has_4bit_bnb_layers, load_and_quantize_model
from .fsdp_utils import (
    disable_fsdp_ram_efficient_loading,
    enable_fsdp_ram_efficient_loading,
    ensure_weights_retied,
    fsdp2_apply_ac,
    fsdp2_canonicalize_names,
    fsdp2_load_full_state_dict,
    fsdp2_prepare_model,
    fsdp2_switch_optimizer_parameters,
    get_fsdp2_grad_scaler,
    load_fsdp_model,
    load_fsdp_optimizer,
    merge_fsdp_weights,
    save_fsdp_model,
    save_fsdp_optimizer,
)
from .launch import (
    PrepareForLaunch,
    _filter_args,
    prepare_deepspeed_cmd_env,
    prepare_multi_gpu_env,
    prepare_sagemager_args_inputs,
    prepare_simple_launcher_cmd_env,
    prepare_tpu,
)

# For docs
from .megatron_lm import (
    AbstractTrainStep,
    BertTrainStep,
    GPTTrainStep,
    MegatronLMDummyDataLoader,
    MegatronLMDummyScheduler,
    T5TrainStep,
    avg_losses_across_data_parallel_group,
)


if is_megatron_lm_available():
    from .megatron_lm import (
        MegatronEngine,
        MegatronLMOptimizerWrapper,
        MegatronLMSchedulerWrapper,
        gather_across_data_parallel_groups,
    )
    from .megatron_lm import initialize as megatron_lm_initialize
    from .megatron_lm import prepare_data_loader as megatron_lm_prepare_data_loader
    from .megatron_lm import prepare_model_optimizer_scheduler as megatron_lm_prepare_model_optimizer_scheduler
    from .megatron_lm import prepare_optimizer as megatron_lm_prepare_optimizer
    from .megatron_lm import prepare_scheduler as megatron_lm_prepare_scheduler
from .memory import find_executable_batch_size, release_memory
from .other import (
    check_os_kernel,
    clean_state_dict_for_safetensors,
    compile_regions,
    compile_regions_deepspeed,
    convert_bytes,
    extract_model_from_parallel,
    get_module_children_bottom_up,
    get_pretty_name,
    has_compiled_regions,
    is_compiled_module,
    is_port_in_use,
    load,
    merge_dicts,
    model_has_dtensor,
    recursive_getattr,
    save,
    wait_for_everyone,
    write_basic_config,
)
from .random import set_seed, synchronize_rng_state, synchronize_rng_states
from .torch_xla import install_xla
from .tqdm import tqdm
from .transformer_engine import (
    apply_fp8_autowrap,
    contextual_fp8_autocast,
    convert_model,
    has_transformer_engine_layers,
)


================================================
FILE: src/accelerate/utils/ao.py
================================================
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Needed utilities for torchao FP8 training.
"""

from functools import partial
from typing import TYPE_CHECKING, Callable, Optional

import torch

from .imports import is_torchao_available, torchao_required


if TYPE_CHECKING:
    if is_torchao_available():
        from torchao.float8.float8_linear import Float8LinearConfig


def find_first_last_linear_layers(model: torch.nn.Module):
    """
    Returns the names of the first and last `torch.nn.Linear` modules of `model`, in `named_modules()` order.

    This is needed during FP8 to avoid issues with instability by keeping the first and last layers unquantized.

    Ref: https://x.com/xariusrke/status/1826669142604141052

    Args:
        model (`torch.nn.Module`):
            The module to search.

    Returns:
        `tuple`: `(first_name, last_name)`. Both are `None` when no linear layer is found, and both are the same
        name when exactly one is found.
    """
    linear_names = [name for name, submodule in model.named_modules() if isinstance(submodule, torch.nn.Linear)]
    if not linear_names:
        return None, None
    return linear_names[0], linear_names[-1]


def filter_linear_layers(module, fqn: str, layers_to_filter: list[str]) -> bool:
    """
    Decides whether a layer passes the filter. Returns `False` when either:
    - `module` is a `torch.nn.Linear` whose `in_features` or `out_features` is not divisible by 16, or
    - `fqn` is listed in `layers_to_filter`.

    Otherwise returns `True`. Modules that are not `torch.nn.Linear` are only checked against `layers_to_filter`.

    Args:
        module (`torch.nn.Module`):
            The module to check.
        fqn (`str`):
            The fully qualified name of the layer.
        layers_to_filter (`List[str]`):
            The list of layers to filter.

    Returns:
        `bool`: Whether the layer passes the filter.
    """
    misaligned = isinstance(module, torch.nn.Linear) and (
        module.in_features % 16 != 0 or module.out_features % 16 != 0
    )
    return not misaligned and fqn not in layers_to_filter


def filter_first_and_last_linear_layers(module, fqn: str) -> bool:
    """
    A filter function that rejects the first and last linear layers found under `module`, as well as any linear
    layer whose feature sizes are not divisible by 16 (see `filter_linear_layers`).

    <Tip>

        For stability reasons, we skip the first and last linear layers Otherwise can lead to the model not training or
        converging properly

    </Tip>

    Args:
        module (`torch.nn.Module`):
            The module to check.
        fqn (`str`):
            The fully qualified name of the layer.

    Returns:
        `bool`: Whether the layer passes the filter.
    """
    # NOTE(review): the first/last names are searched inside `module` itself. If a caller passes an individual
    # `nn.Linear` rather than the root model, both names resolve relative to that layer and will not match its
    # `fqn` within the full model -- confirm callers provide the intended module.
    excluded = list(find_first_last_linear_layers(module))
    return filter_linear_layers(module, fqn, layers_to_filter=excluded)


@torchao_required
def has_ao_layers(model: torch.nn.Module):
    """
    Returns `True` if `model` contains at least one torchao `Float8Linear` module, `False` otherwise.
    """
    from torchao.float8.float8_linear import Float8Linear

    return any(isinstance(submodule, Float8Linear) for submodule in model.modules())


@torchao_required
def convert_model_to_fp8_ao(
    model: torch.nn.Module,
    config: Optional["Float8LinearConfig"] = None,
    module_filter_func: Optional[Callable] = filter_first_and_last_linear_layers,
):
    """
    Converts all `nn.Linear` layers in the model (except the first and last) to torchao's `Float8Linear` layer inplace.

    Args:
        model (`torch.nn.Module`):
            The model to convert.
        config (`torchao.float8.Float8LinearConfig`, *optional*):
            The configuration for the FP8 training. Recommended to utilize
            `torchao.float8.recipe_name_to_linear_config` to generate this. In general, the default config should be
            sufficient (what is passed when set to `None`).
        module_filter_func (`Callable`, *optional*, defaults to `filter_first_and_last_linear_layers`):
            Optional function that must take in a module and layer name, and returns a boolean indicating whether the
            module should be converted to FP8. When left at its default (or set to `None`), a filter is built from
            `filter_linear_layers` that skips the first and last linear layers of `model` and any linear layer whose
            feature sizes are not divisible by 16. See `filter_linear_layers` for an example.

    Example:

    ```python
    from accelerate.utils.ao import convert_model_to_fp8_ao
    from accelerate import Accelerator

    accelerator = Accelerator()

    model = MyModel()
    model.to(accelerator.device)
    convert_model_to_fp8_ao(model)

    model.train()
    ```
    """
    from torchao.float8 import convert_to_float8_training

    # The filter is invoked with each candidate layer, not with the whole model, so the stock
    # `filter_first_and_last_linear_layers` cannot discover the model-level first/last layer names on its own.
    # Resolve them here, where the full model is available, and bind them into the filter.
    if module_filter_func is None or module_filter_func is filter_first_and_last_linear_layers:
        first_linear, last_linear = find_first_last_linear_layers(model)
        module_filter_func = partial(filter_linear_layers, layers_to_filter=[first_linear, last_linear])
    convert_to_float8_training(model, module_filter_fn=module_filter_func, config=config)


================================================
FILE: src/accelerate/utils/bnb.py
================================================
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import logging
import os
from copy import deepcopy
from typing import Optional, Union

import torch
import torch.nn as nn

from accelerate.utils.imports import (
    is_4bit_bnb_available,
    is_8bit_bnb_available,
)

from ..big_modeling import dispatch_model, init_empty_weights
from .dataclasses import BnbQuantizationConfig
from .modeling import (
    find_tied_parameters,
    get_balanced_memory,
    infer_auto_device_map,
    load_checkpoint_in_model,
    offload_weight,
    set_module_tensor_to_device,
)


logger = logging.getLogger(__name__)


def load_and_quantize_model(
    model: torch.nn.Module,
    bnb_quantization_config: BnbQuantizationConfig,
    weights_location: Optional[Union[str, os.PathLike]] = None,
    device_map: Optional[dict[str, Union[int, str, torch.device]]] = None,
    no_split_module_classes: Optional[list[str]] = None,
    max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
    offload_folder: Optional[Union[str, os.PathLike]] = None,
    offload_state_dict: bool = False,
):
    """
    This function will quantize the input model with the associated config passed in `bnb_quantization_config`. If the
    model is in the meta device, we will load and dispatch the weights according to the `device_map` passed. If the
    model is already loaded, we will quantize the model and put the model on the GPU (or Intel XPU).

    Note that `bnb_quantization_config` is updated in place: `skip_modules` and `keep_in_fp32_modules` are filled in
    when they are `None`, and `skip_modules` is extended with the extra modules that are not converted.

    Args:
        model (`torch.nn.Module`):
            Input model. The model can be already loaded or on the meta device
        bnb_quantization_config (`BnbQuantizationConfig`):
            The bitsandbytes quantization parameters
        weights_location (`str` or `os.PathLike`, *optional*):
            The folder weights_location to load. Required when the model is on the meta device. It can be:
            - a path to a file containing a whole model state dict
            - a path to a `.json` file containing the index to a sharded checkpoint
            - a path to a folder containing a unique `.index.json` file and the shards of a checkpoint.
            - a path to a folder containing a unique pytorch_model.bin file.
        device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*):
            A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
            name, once a given module name is inside, every submodule of it will be sent to the same device.
        no_split_module_classes (`List[str]`, *optional*):
            A list of layer class names that should never be split across device (for instance any layer that has a
            residual connection).
        max_memory (`Dict`, *optional*):
            A dictionary device identifier to maximum memory. Will default to the maximum memory available if unset.
        offload_folder (`str` or `os.PathLike`, *optional*):
            If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
        offload_state_dict (`bool`, *optional*, defaults to `False`):
            If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
            the weight of the CPU state dict + the biggest shard does not fit.

    Returns:
        `torch.nn.Module`: The quantized model

    Raises:
        `ImportError`: If 8bit quantization is requested but the installed `bitsandbytes` does not support it.
        `ValueError`: If 4bit quantization is requested but the installed `bitsandbytes` does not support it.
        `RuntimeError`: If the model is already loaded and neither a GPU nor an Intel XPU is found, or if the model
        is on the meta device and `weights_location` is `None`.
    """

    load_in_4bit = bnb_quantization_config.load_in_4bit
    load_in_8bit = bnb_quantization_config.load_in_8bit

    # Fail early if the installed bitsandbytes cannot provide the requested precision.
    if load_in_8bit and not is_8bit_bnb_available():
        raise ImportError(
            "You have a version of `bitsandbytes` that is not compatible with 8bit quantization,"
            " make sure you have the latest version of `bitsandbytes` installed."
        )
    if load_in_4bit and not is_4bit_bnb_available():
        raise ValueError(
            "You have a version of `bitsandbytes` that is not compatible with 4bit quantization,"
            "make sure you have the latest version of `bitsandbytes` installed."
        )

    modules_on_cpu = []
    # custom device map: collect the entries the caller placed on "cpu" or "disk"
    if isinstance(device_map, dict) and len(device_map.keys()) > 1:
        modules_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]

    # We keep some modules such as the lm_head in their original dtype for numerical stability reasons
    if bnb_quantization_config.skip_modules is None:
        bnb_quantization_config.skip_modules = get_keys_to_not_convert(model)

    # add cpu modules to skip modules only for 4-bit modules
    if load_in_4bit:
        bnb_quantization_config.skip_modules.extend(modules_on_cpu)
    # NOTE: this is an alias of the config's list, not a copy, so the `extend` below also grows
    # `bnb_quantization_config.skip_modules`.
    modules_to_not_convert = bnb_quantization_config.skip_modules

    # We add the modules we want to keep in full precision
    if bnb_quantization_config.keep_in_fp32_modules is None:
        bnb_quantization_config.keep_in_fp32_modules = []
    keep_in_fp32_modules = bnb_quantization_config.keep_in_fp32_modules
    modules_to_not_convert.extend(keep_in_fp32_modules)

    # compatibility with peft
    model.is_loaded_in_4bit = load_in_4bit
    model.is_loaded_in_8bit = load_in_8bit

    model_device = get_parameter_device(model)
    if model_device.type != "meta":
        # quantization of an already loaded model
        logger.warning(
            "It is not recommended to quantize a loaded model. "
            "The model should be instantiated under the `init_empty_weights` context manager."
        )
        model = replace_with_bnb_layers(model, bnb_quantization_config, modules_to_not_convert=modules_to_not_convert)
        # convert param to the right dtype
        dtype = bnb_quantization_config.torch_dtype
        for name, param in model.named_parameters():
            # Parameters matching a `keep_in_fp32_modules` entry (substring match on the name) stay in float32;
            # every other floating point parameter is cast to the configured dtype.
            if any(module_to_keep_in_fp32 in name for module_to_keep_in_fp32 in keep_in_fp32_modules):
                param.data = param.data.to(torch.float32)
            elif torch.is_floating_point(param):
                param.data = param.data.to(dtype)
        # Move the model to the current accelerator device: CUDA is tried first, then Intel XPU.
        if model_device.type == "cuda":
            model.cuda(torch.cuda.current_device())
            torch.cuda.empty_cache()
        elif torch.cuda.is_available():
            model.to(torch.cuda.current_device())
        elif torch.xpu.is_available():
            model.to(torch.xpu.current_device())
        else:
            raise RuntimeError("No GPU or Intel XPU found. A GPU or Intel XPU is needed for quantization.")
        # Logged after the move, and for every original device type (including "cuda").
        logger.info(
            f"The model device type is {model_device.type}. However, gpu or intel xpu is needed for quantization."
            "We move the model to it."
        )
        return model

    elif weights_location is None:
        # A meta-device model has no weights of its own, so a checkpoint location is mandatory.
        raise RuntimeError(
            f"`weights_location` needs to be the folder path containing the weights of the model, but we found {weights_location} "
        )

    else:
        # Swap in the bnb layers while still on the meta device, before any weights are loaded.
        with init_empty_weights():
            model = replace_with_bnb_layers(
                model, bnb_quantization_config, modules_to_not_convert=modules_to_not_convert
            )
        device_map = get_quantized_model_device_map(
            model,
            bnb_quantization_config,
            device_map,
            max_memory=max_memory,
            no_split_module_classes=no_split_module_classes,
        )
        # Only takes effect when the caller explicitly passes `offload_state_dict=None`; the default is `False`.
        if offload_state_dict is None and device_map is not None and "disk" in device_map.values():
            offload_state_dict = True

        # True when at least one entry of the device map targets "cpu" or "disk".
        offload = any(x in list(device_map.values()) for x in ["cpu", "disk"])

        load_checkpoint_in_model(
            model,
            weights_location,
            device_map,
            dtype=bnb_quantization_config.torch_dtype,
            offload_folder=offload_folder,
            offload_state_dict=offload_state_dict,
            keep_in_fp32_modules=bnb_quantization_config.keep_in_fp32_modules,
            offload_8bit_bnb=load_in_8bit and offload,
        )
        return dispatch_model(model, device_map=device_map, offload_dir=offload_folder)


def get_quantized_model_device_map(
    model, bnb_quantization_config, device_map=None, max_memory=None, no_split_module_classes=None
):
    """
    Resolve the device map used to dispatch a model that is going to be quantized with `bitsandbytes`.

    Args:
        model (`torch.nn.Module`):
            The model whose parameters are inspected (only needed when `device_map` is a string).
        bnb_quantization_config:
            Quantization config. This function reads `skip_modules`, `keep_in_fp32_modules`, `torch_dtype`,
            `target_dtype` and `load_in_4bit` from it.
        device_map (`dict` or `str`, *optional*):
            Either an explicit mapping, one of `"auto"`, `"balanced"`, `"balanced_low_0"`, `"sequential"`, or
            `None` to place the whole model on the current CUDA/XPU device.
        max_memory (*optional*):
            Forwarded to `get_balanced_memory` / `infer_auto_device_map` when `device_map` is a string.
        no_split_module_classes (*optional*):
            Forwarded to `get_balanced_memory` / `infer_auto_device_map` when `device_map` is a string.

    Returns:
        The resolved device map (a `dict` unless an unsupported type was passed through unchanged).

    Raises:
        RuntimeError: If `device_map` is `None` and neither a CUDA device nor an Intel XPU is available.
        ValueError: If `device_map` is an unsupported string, or if 4-bit quantization is requested while some
            module that should be quantized is placed on `"cpu"` or `"disk"`.
    """
    if device_map is None:
        if torch.cuda.is_available():
            device_map = {"": torch.cuda.current_device()}
        # `torch.xpu` does not exist on every torch build, so guard the attribute access.
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            device_map = {"": torch.xpu.current_device()}
        else:
            raise RuntimeError("No GPU or Intel XPU found. A GPU or Intel XPU is needed for quantization.")
        # Report the map that was actually chosen rather than always mentioning CUDA.
        logger.info(f"The device_map was not initialized. Setting device_map to `{device_map}`.")

    if isinstance(device_map, str):
        if device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
            raise ValueError(
                "If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or "
                "'sequential'."
            )

        # Parameters of skipped modules keep `torch_dtype`; the fp32 entries are applied last so they win
        # when a parameter matches both lists.
        param_names = [name for name, _ in model.named_parameters()]
        special_dtypes = {
            name: bnb_quantization_config.torch_dtype
            for name in param_names
            if any(m in name for m in bnb_quantization_config.skip_modules)
        }
        special_dtypes.update(
            {
                name: torch.float32
                for name in param_names
                if any(m in name for m in bnb_quantization_config.keep_in_fp32_modules)
            }
        )

        kwargs = {
            "special_dtypes": special_dtypes,
            "no_split_module_classes": no_split_module_classes,
            "dtype": bnb_quantization_config.target_dtype,
        }

        # get max_memory for each device.
        if device_map != "sequential":
            max_memory = get_balanced_memory(
                model,
                low_zero=(device_map == "balanced_low_0"),
                max_memory=max_memory,
                **kwargs,
            )

        kwargs["max_memory"] = max_memory
        device_map = infer_auto_device_map(model, **kwargs)

    if isinstance(device_map, dict):
        # Check that no module which will be quantized ends up on the cpu or the disk.
        modules_not_to_convert = bnb_quantization_config.skip_modules + bnb_quantization_config.keep_in_fp32_modules
        has_offloaded_quantized_module = any(
            device in ("cpu", "disk") for key, device in device_map.items() if key not in modules_not_to_convert
        )
        if has_offloaded_quantized_module:
            if bnb_quantization_config.load_in_4bit:
                # The literal (including its indentation) is kept exactly as it has always been emitted.
                raise ValueError(
                    """
                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
                        these modules in `torch_dtype`, you need to pass a custom `device_map` to
                        `load_and_quantize_model`. Check
                        https://huggingface.co/docs/accelerate/main/en/usage_guides/quantization#offload-modules-to-cpu-and-disk
                        for more details.
                        """
                )
            logger.info(
                "Some modules are offloaded to the CPU or the disk. Note that these modules will be converted to 8-bit"
            )
    return device_map


def replace_with_bnb_layers(model, bnb_quantization_config, modules_to_not_convert=None, current_key_name=None):
    """
    Swap the `torch.nn.Linear` modules of `model` for `bnb.nn.Linear8bitLt` or `bnb.nn.Linear4bit` modules from the
    `bitsandbytes` library. The actual (recursive) replacement is delegated to `_replace_with_bnb_layers`; this
    wrapper only normalizes the arguments and warns when nothing was replaced.

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        bnb_quantization_config:
            Quantization configuration, passed through unchanged to `_replace_with_bnb_layers`.
        modules_to_not_convert (`List[str]`, *optional*):
            Names of the modules to leave unquantized. In practice we keep the `lm_head` in full precision for
            numerical stability reasons. `None` is treated as an empty list.
        current_key_name (`List[str]`, *optional*):
            An array to track the current key of the recursion. This is used to check whether the current key (part of
            it) is not in the list of modules to not convert.

    Returns:
        The model returned by `_replace_with_bnb_layers`.
    """
    excluded_modules = [] if modules_to_not_convert is None else modules_to_not_convert

    converted_model, any_replaced = _replace_with_bnb_layers(
        model, bnb_quantization_config, excluded_modules, current_key_name
    )
    if any_replaced:
        return converted_model

    # Nothing was swapped: most likely the architecture has no `nn.Linear` layers.
    logger.warning(
        "You are loading your model in 8bit or 4bit but no linear modules were found in your model."
        " this can happen for some architectures such as gpt2 that uses Conv1D instead of Linear layers."
        " Please double check your model architecture, or submit an issue on github if you think this is"
        " a bug."
    )
    return converted_model


def _replace_with_bnb_layers(
    model,
    bnb_quantization_config,
    modules_to_not_convert=None,
    current_key_name=None,
):
    """
    Private method that wraps the recursion for module replacement.

    Every direct `nn.Linear` child of `model` that is not excluded is replaced in place by a `bnb.nn.Linear8bitLt`
    (when `load_in_8bit` is set) or a `bnb.nn.Linear4bit` (when `load_in_4bit` is set) that reuses the original
    weight and bias data and has gradients disabled. Children that have sub-modules are then processed
    recursively.

    Parameters:
        model (`torch.nn.Module`):
            Module whose children are inspected.
        bnb_quantization_config:
            Config providing `load_in_8bit`, `load_in_4bit`, `llm_int8_threshold`, `bnb_4bit_compute_dtype`,
            `bnb_4bit_use_double_quant` and `bnb_4bit_quant_type`.
        modules_to_not_convert (`List[str]`, *optional*):
            Names (or dotted paths) of modules to leave untouched. `None` is treated as an empty list.
        current_key_name (`List[str]`, *optional*):
            Path components leading to `model`; mutated during the recursion and restored before returning.

    Returns the converted model and a boolean that indicates if the conversion has been successful or not.

    Raises:
        ValueError: If neither `load_in_8bit` nor `load_in_4bit` is set on the config.
    """
    # bitsandbytes will initialize device(e.g. CUDA, XPU) on import, so it needs to be imported lazily
    import bitsandbytes as bnb

    # The signature advertises `None` as default, so make it usable instead of failing on `name not in None`.
    if modules_to_not_convert is None:
        modules_to_not_convert = []
    if current_key_name is None:
        current_key_name = []

    has_been_replaced = False
    for name, module in model.named_children():
        current_key_name.append(name)
        if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
            # Skip the module when its dotted path equals an excluded key or contains `<key>.`
            current_key_name_str = ".".join(current_key_name)
            excluded = any(
                key == current_key_name_str or key + "." in current_key_name_str for key in modules_to_not_convert
            )
            if not excluded:
                # Load bnb module with empty weight and replace the `nn.Linear` module
                if bnb_quantization_config.load_in_8bit:
                    bnb_module = bnb.nn.Linear8bitLt(
                        module.in_features,
                        module.out_features,
                        module.bias is not None,
                        has_fp16_weights=False,
                        threshold=bnb_quantization_config.llm_int8_threshold,
                    )
                elif bnb_quantization_config.load_in_4bit:
                    bnb_module = bnb.nn.Linear4bit(
                        module.in_features,
                        module.out_features,
                        module.bias is not None,
                        bnb_quantization_config.bnb_4bit_compute_dtype,
                        compress_statistics=bnb_quantization_config.bnb_4bit_use_double_quant,
                        quant_type=bnb_quantization_config.bnb_4bit_quant_type,
                    )
                else:
                    raise ValueError("load_in_8bit and load_in_4bit can't be both False")
                bnb_module.weight.data = module.weight.data
                if module.bias is not None:
                    bnb_module.bias.data = module.bias.data
                bnb_module.requires_grad_(False)
                setattr(model, name, bnb_module)
                has_been_replaced = True
        if len(list(module.children())) > 0:
            _, child_has_been_replaced = _replace_with_bnb_layers(
                module, bnb_quantization_config, modules_to_not_convert, current_key_name
            )
            has_been_replaced = has_been_replaced or child_has_been_replaced
        # Remove the last key for recursion
        current_key_name.pop(-1)
    return model, has_been_replaced


def get_keys_to_not_convert(model):
    r"""
    A utility function to get the keys of the modules to keep in full precision, if any. For example, for CausalLM
    modules we may want to keep the lm_head in full precision for numerical stability reasons. For other
    architectures, we want to keep the tied weights of the model. The function returns the list of module keys that
    should not be converted to int8.

    Parameters:
    model (`torch.nn.Module`):
        Input model

    Returns:
        `List[str]`: Tied parameter keys (with a trailing `.weight` / `.bias` removed) followed by the name of the
        last direct child of `model`, without duplicates. Empty when the model has no tied parameters and is a base
        model (it has a `base_model_prefix` attribute but no attribute of that name).
    """
    # Create a copy of the model
    with init_empty_weights():
        tied_model = deepcopy(model)  # this has 0 cost since it is done inside `init_empty_weights` context manager`

    tied_params = find_tied_parameters(tied_model)
    # For compatibility with Accelerate < 0.18
    if isinstance(tied_params, dict):
        tied_keys = sum(list(tied_params.values()), []) + list(tied_params.keys())
    else:
        tied_keys = sum(tied_params, [])
    has_tied_params = len(tied_keys) > 0

    # Check if it is a base model
    is_base_model = False
    if hasattr(model, "base_model_prefix"):
        is_base_model = not hasattr(model, model.base_model_prefix)

    # Ignore this for base models (BertModel, GPT2Model, etc.)
    if (not has_tied_params) and is_base_model:
        return []

    # Tied weights first, de-duplicated while keeping a deterministic order.
    untouched = list(dict.fromkeys(tied_keys))

    # Otherwise the model has an attached head: add the last direct child, if there is one.
    children = list(model.named_children())
    if children:
        last_module_name = children[-1][0]
        if last_module_name not in untouched:
            untouched.append(last_module_name)

    # Turn parameter keys into module keys by dropping a trailing ".weight" / ".bias" only; a plain
    # `str.replace` would also mangle names that merely contain those substrings.
    suffixes_to_remove = (".weight", ".bias")
    filtered_module_names = []
    for name in untouched:
        for suffix in suffixes_to_remove:
            if name.endswith(suffix):
                name = name[: -len(suffix)]
                break
        filtered_module_names.append(name)

    return list(dict.fromkeys(filtered_module_names))


def has_4bit_bnb_layers(model):
    """Return `True` when at least one sub-module of `model` is a `bnb.nn.Linear4bit` layer, `False` otherwise."""
    # bitsandbytes will initialize device(e.g. CUDA, XPU) on import, so it needs to be imported lazily
    import bitsandbytes as bnb

    return any(isinstance(submodule, bnb.nn.Linear4bit) for submodule in model.modules())


def get_parameter_device(parameter: nn.Module):
    """Return the device of the first parameter yielded by `parameter.parameters()`."""
    first_param = next(parameter.parameters())
    return first_param.device


def quantize_and_offload_8bit(model, param, param_name, new_dtype, offload_folder, offload_index, fp16_statistics):
    """
    Offload an 8-bit weight (and its `SCB` statistics) to `offload_folder`, then replace the parameter in `model`
    with an empty tensor on the `meta` device.

    When `fp16_statistics` is `None`, the weight is first placed on device `0` through
    `set_module_tensor_to_device` and the resulting module parameter (plus its `SCB` attribute, if it has one) is
    what gets offloaded. Otherwise `param` and `fp16_statistics` are offloaded directly. The statistics are stored
    under `param_name` with `"weight"` replaced by `"SCB"`.
    """
    scb_name = param_name.replace("weight", "SCB")

    if fp16_statistics is not None:
        # Weight and statistics are already available: offload them as they are.
        offload_weight(param, param_name, offload_folder, index=offload_index)
        offload_weight(fp16_statistics, scb_name, offload_folder, index=offload_index)
    else:
        # Not quantized yet: put the value on the device first, then offload what the module now holds.
        set_module_tensor_to_device(model, param_name, 0, dtype=new_dtype, value=param)

        # Walk down the dotted path to the module that owns the parameter.
        *parent_path, leaf_name = param_name.split(".")
        owner = model
        for attr in parent_path:
            child = getattr(owner, attr)
            if child is None:
                raise ValueError(f"{owner} has no attribute {attr}.")
            owner = child

        # offload weights
        stored_param = owner._parameters[leaf_name]
        stored_param.requires_grad = False
        offload_weight(stored_param, param_name, offload_folder, index=offload_index)
        if hasattr(stored_param, "SCB"):
            offload_weight(stored_param.SCB, scb_name, offload_folder, index=offload_index)

    placeholder = torch.empty(*param.size())
    set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype, value=placeholder)


================================================
FILE: src/accelerate/utils/constants.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import operator as op

import torch


# File names, base names and file-name patterns for serialized artifacts.
# NOTE(review): how each name is consumed is not visible in this module; check the call sites.
SCALER_NAME = "scaler.pt"
MODEL_NAME = "pytorch_model"
SAFE_MODEL_NAME = "model"
RNG_STATE_NAME = "random_states"
OPTIMIZER_NAME = "optimizer"
SCHEDULER_NAME = "scheduler"
SAMPLER_NAME = "sampler"
PROFILE_PATTERN_NAME = "profile_{suffix}.json"
# The f-string constants below are derived from the base names above
# (e.g. WEIGHTS_INDEX_NAME evaluates to "pytorch_model.bin.index.json").
WEIGHTS_NAME = f"{MODEL_NAME}.bin"
WEIGHTS_PATTERN_NAME = "pytorch_model{suffix}.bin"
WEIGHTS_INDEX_NAME = f"{WEIGHTS_NAME}.index.json"
SAFE_WEIGHTS_NAME = f"{SAFE_MODEL_NAME}.safetensors"
SAFE_WEIGHTS_PATTERN_NAME = "model{suffix}.safetensors"
SAFE_WEIGHTS_INDEX_NAME = f"{SAFE_WEIGHTS_NAME}.index.json"
# SageMaker-related versions and instance types (presumably defaults for SageMaker launches; confirm at call sites).
SAGEMAKER_PYTORCH_VERSION = "1.10.2"
SAGEMAKER_PYTHON_VERSION = "py38"
SAGEMAKER_TRANSFORMERS_VERSION = "4.17.0"
SAGEMAKER_PARALLEL_EC2_INSTANCES = ["ml.p3.16xlarge", "ml.p3dn.24xlarge", "ml.p4dn.24xlarge"]
# Option names for the FSDP setting named by each constant.
FSDP_SHARDING_STRATEGY = ["FULL_SHARD", "SHARD_GRAD_OP", "NO_SHARD", "HYBRID_SHARD", "HYBRID_SHARD_ZERO2"]
FSDP_AUTO_WRAP_POLICY = ["TRANSFORMER_BASED_WRAP", "SIZE_BASED_WRAP", "NO_WRAP"]
FSDP_BACKWARD_PREFETCH = ["BACKWARD_PRE", "BACKWARD_POST", "NO_PREFETCH"]
FSDP_STATE_DICT_TYPE = ["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"]
FSDP2_STATE_DICT_TYPE = ["SHARDED_STATE_DICT", "FULL_STATE_DICT"]
# Version strings (presumably minimum versions required for the feature in each name; confirm at call sites).
FSDP_PYTORCH_VERSION = (
    "2.1.0.a0+32f93b1"  # Technically should be 2.1.0, but MS-AMP uses this specific prerelease in their Docker image.
)
FSDP2_PYTORCH_VERSION = "2.6.0"
DTENSOR_PYTORCH_VERSION = "2.5.0"
FSDP_MODEL_NAME = "pytorch_model_fsdp"
DEEPSPEED_MULTINODE_LAUNCHERS = ["pdsh", "standard", "openmpi", "mvapich", "mpich", "nossh", "slurm"]
TORCH_DYNAMO_MODES = ["default", "reduce-overhead", "max-autotune"]
ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION = "2.2.0"
XPU_PROFILING_AVAILABLE_PYTORCH_VERSION = "2.4.0"
# NOTE(review): "MITA" is presumably a misspelling of "MTIA"; the name is imported by other modules, so it is
# kept as-is here.
MITA_PROFILING_AVAILABLE_PYTORCH_VERSION = "2.1.0"
BETA_TP_AVAILABLE_PYTORCH_VERSION = "2.3.0"

BETA_TP_AVAILABLE_TRANSFORMERS_VERSION = "4.52.0"
BETA_CP_AVAILABLE_PYTORCH_VERSION = "2.6.0"
BETA_SP_AVAILABLE_DEEPSPEED_VERSION = "0.18.2"

# Maps a comparison operator written as a string to the matching function of the `operator` module.
STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}

# These are the args for `torch.distributed.launch` for pytorch < 1.9
# The list holds bare argument names (no leading dashes). It contains single-letter entries ("m", "r", "t")
# next to long names; presumably these are the short aliases of "module", "redirects" and "tee" (confirm
# against the launcher's argument parser).
TORCH_LAUNCH_PARAMS = [
    "nnodes",
    "nproc_per_node",
    "rdzv_backend",
    "rdzv_endpoint",
    "rdzv_id",
    "rdzv_conf",
    "standalone",
    "max_restarts",
    "monitor_interval",
    "start_method",
    "role",
    "module",
    "m",
    "no_python",
    "run_path",
    "log_dir",
    "r",
    "redirects",
    "t",
    "tee",
    "node_rank",
    "master_addr",
    "master_port",
]

# Names of distributed types, as plain strings.
# NOTE(review): presumably these match member names of a distributed-type enum defined elsewhere; confirm.
CUDA_DISTRIBUTED_TYPES = ["DEEPSPEED", "MULTI_GPU", "FSDP", "MEGATRON_LM", "TP"]
# Superset of `CUDA_DISTRIBUTED_TYPES`: the CUDA entries plus the `MULTI_*` entries listed below.
TORCH_DISTRIBUTED_OPERATION_TYPES = CUDA_DISTRIBUTED_TYPES + [
    "MULTI_NPU",
    "MULTI_MLU",
    "MULTI_SDAA",
    "MULTI_MUSA",
    "MULTI_XPU",
    "MULTI_CPU",
    "MULTI_HPU",
    "MULTI_NEURON",
]
# Tuple of `torch.nn` layer classes (1d/2d/3d convolutions, transposed convolutions and `Linear`); being a
# tuple, it can be passed directly to `isinstance`.
# NOTE(review): the name suggests these are the layer types eligible for upcasting; confirm at the call sites.
SUPPORTED_PYTORCH_LAYERS_FOR_UPCASTING = (
    torch.nn.Conv1d,
    torch.nn.Conv2d,
    torch.nn.Conv3d,
    torch.nn.ConvTranspose1d,
    torch.nn.ConvTranspose2d,
    torch.nn.ConvTranspose3d,
    torch.nn.Linear,
)


================================================
FILE: src/accelerate/utils/dataclasses.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
General namespace and dataclass related classes
"""

import argparse
import copy
import enum
import functools
import logging
import os
import warnings
from collections.abc import Iterable
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import timedelta
from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union, get_args

import torch

from .constants import (
    BETA_CP_AVAILABLE_PYTORCH_VERSION,
    BETA_TP_AVAILABLE_PYTORCH_VERSION,
    BETA_TP_AVAILABLE_TRANSFORMERS_VERSION,
    FSDP2_PYTORCH_VERSION,
    FSDP_AUTO_WRAP_POLICY,
    FSDP_BACKWARD_PREFETCH,
    FSDP_SHARDING_STRATEGY,
    MITA_PROFILING_AVAILABLE_PYTORCH_VERSION,
    XPU_PROFILING_AVAILABLE_PYTORCH_VERSION,
)
from .environment import parse_flag_from_env, str_to_bool
from .imports import (
    is_cuda_available,
    is_hpu_available,
    is_mlu_available,
    is_msamp_available,
    is_musa_available,
    is_npu_available,
    is_torchao_available,
    is_transformer_engine_available,
    is_xpu_available,
)
from .versions import compare_versions, is_torch_version


if TYPE_CHECKING:
    # Mock imports for type checking
    from torchao.float8 import Float8LinearConfig


logger = logging.getLogger(__name__)


class KwargsHandler:
    """
    Internal mixin giving a dataclass the `to_dict()` and `to_kwargs()` helpers.
    """

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(vars(self))

    def to_kwargs(self):
        """
        Returns a dictionary containing only the attributes whose value differs from the one found on a
        default-constructed instance of this class (built with the environment cleared).
        """
        # import clear_environment here to avoid circular import problem
        from .environment import clear_environment

        handler_cls = self.__class__
        with clear_environment():
            defaults = handler_cls().to_dict()

        overrides = {}
        for key, value in self.to_dict().items():
            if defaults[key] != value:
                overrides[key] = value
        return overrides


class EnumWithContains(enum.EnumMeta):
    """Enum metaclass making `item in EnumClass` true whenever `EnumClass(item)` succeeds."""

    def __contains__(cls, item):
        # Membership is defined by whether the enum lookup accepts the item.
        try:
            cls(item)
        except ValueError:
            return False
        else:
            return True


class BaseEnum(enum.Enum, metaclass=EnumWithContains):
    """Enum base class whose members stringify to their value, i.e. `str(Enum.key)` gives the value."""

    def __str__(self):
        return self.value

    @classmethod
    def list(cls):
        """Return `str(member)` for every member of `cls`, in definition order."""
        return [str(member) for member in cls]


@dataclass
class AutocastKwargs(KwargsHandler):
    """
    Use this object in your [`Accelerator`] to customize how `torch.autocast` behaves. Please refer to the
    documentation of this [context manager](https://pytorch.org/docs/stable/amp.html#torch.autocast) for more
    information on each argument.

    Args:
        enabled (`bool`, defaults to `True`):
            Presumably forwarded as the `enabled` argument of `torch.autocast` (confirm at the call site).
        cache_enabled (`bool`, *optional*, defaults to `None`):
            Presumably forwarded as the `cache_enabled` argument of `torch.autocast` (confirm at the call site).

    Example:

    ```python
    from accelerate import Accelerator
    from accelerate.utils import AutocastKwargs

    kwargs = AutocastKwargs(cache_enabled=True)
    accelerator = Accelerator(kwargs_handlers=[kwargs])
    ```
    """

    enabled: bool = True
    cache_enabled: Optional[bool] = None


class DDPCommunicationHookType(BaseEnum):
    """
    Represents a type of communication hook used in DDP.

    Being a `BaseEnum`, `str(member)` returns the member's string value (e.g. `"fp16"`).

    Values:

        - **NO** -- no communication hook
        - **FP16** -- DDP communication hook to compress the gradients in FP16
        - **BF16** -- DDP communication hook to compress the gradients in BF16
        - **POWER_SGD** -- DDP communication hook to use PowerSGD
        - **BATCHED_POWER_SGD** -- DDP communication hook to use batched PowerSGD
    """

    NO = "no"
    FP16 = "fp16"
    BF16 = "bf16"
    POWER_SGD = "power_sgd"
    BATCHED_POWER_SGD = "batched_power_sgd"


@dataclass
class DistributedDataParallelKwargs(KwargsHandler):
    """
    Use this object in your [`Accelerator`] to customize how your model is wrapped in a
    `torch.nn.parallel.DistributedDataParallel`. Please refer to the documentation of this
    [wrapper](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) for more
    information on each argument.

    The `comm_hook`, `comm_wrapper` and `comm_state_option` fields are not part of `to_dict()` by default; they are
    consumed by `register_comm_hook` instead.

    <Tip warning={true}>

    `gradient_as_bucket_view` is only available in PyTorch 1.7.0 and later versions.

    `static_graph` is only available in PyTorch 1.11.0 and later versions.

    </Tip>

    Example:

    ```python
    from accelerate import Accelerator
    from accelerate.utils import DistributedDataParallelKwargs

    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
    accelerator = Accelerator(kwargs_handlers=[kwargs])
    ```
    """

    dim: int = 0
    broadcast_buffers: bool = True
    bucket_cap_mb: int = 25
    find_unused_parameters: bool = False
    check_reduction: bool = False
    gradient_as_bucket_view: bool = False
    static_graph: bool = False

    comm_hook: DDPCommunicationHookType = DDPCommunicationHookType.NO
    comm_wrapper: Literal[
        DDPCommunicationHookType.NO,
        DDPCommunicationHookType.FP16,
        DDPCommunicationHookType.BF16,
    ] = DDPCommunicationHookType.NO
    comm_state_option: dict = field(default_factory=dict)

    def to_dict(self, ignore_keys=("comm_hook", "comm_wrapper", "comm_state_option")):
        """Return the attribute dictionary of the parent class without the keys listed in `ignore_keys`."""
        everything = super().to_dict()
        return {key: value for key, value in everything.items() if key not in ignore_keys}

    def register_comm_hook(self, model):
        """
        Register the communication hook selected by `comm_hook` on `model`, wrapped by the wrapper selected by
        `comm_wrapper` when there is one. Nothing is registered when `comm_hook` maps to no hook.
        """
        from torch.distributed.algorithms.ddp_comm_hooks import (
            default_hooks,
            powerSGD_hook,
        )

        hooks_by_type = {
            DDPCommunicationHookType.FP16: default_hooks.fp16_compress_hook,
            DDPCommunicationHookType.BF16: default_hooks.bf16_compress_hook,
            DDPCommunicationHookType.POWER_SGD: powerSGD_hook.powerSGD_hook,
            DDPCommunicationHookType.BATCHED_POWER_SGD: powerSGD_hook.batched_powerSGD_hook,
        }
        wrappers_by_type = {
            DDPCommunicationHookType.FP16: default_hooks.fp16_compress_wrapper,
            DDPCommunicationHookType.BF16: default_hooks.bf16_compress_wrapper,
        }

        selected_hook = hooks_by_type.get(self.comm_hook)
        if not selected_hook:
            return

        selected_wrapper = wrappers_by_type.get(self.comm_wrapper)
        if selected_wrapper:
            selected_hook = selected_wrapper(selected_hook)

        # Only the PowerSGD hooks get a state object; the other hooks are registered with `state=None`.
        power_sgd_types = (
            DDPCommunicationHookType.POWER_SGD,
            DDPCommunicationHookType.BATCHED_POWER_SGD,
        )
        if self.comm_hook in power_sgd_types:
            hook_state = powerSGD_hook.PowerSGDState(None, **self.comm_state_option)
        else:
            hook_state = None

        model.register_comm_hook(
            state=hook_state,
            hook=selected_hook,
        )


@dataclass
class GradScalerKwargs(KwargsHandler):
    """
    Use this object in your [`Accelerator`] to customize the behavior of mixed precision, specifically how the
    `torch.amp.GradScaler` or `torch.cuda.amp.GradScaler` used is created. Please refer to the documentation of this
    [scaler](https://pytorch.org/docs/stable/amp.html?highlight=gradscaler) for more information on each argument.

    The fields are `init_scale` (default `65536.0`), `growth_factor` (default `2.0`), `backoff_factor` (default
    `0.5`), `growth_interval` (default `2000`) and `enabled` (default `True`); presumably each one is forwarded to
    the scaler argument of the same name (confirm at the call site).

    <Tip warning={true}>

    `torch.cuda.amp.GradScaler` is only available in PyTorch 1.5.0 and later versions, and `torch.amp.GradScaler` is
    only available in PyTorch 2.4.0 and later versions.

    </Tip>

    Example:

    ```python
    from accelerate import Accelerator
    from accelerate.utils import GradScalerKwargs

    kwargs = GradScalerKwargs(backoff_factor=0.25)
    accelerator = Accelerator(kwargs_handlers=[kwargs])
    ```
    """

    init_scale: float = 65536.0
    growth_factor: float = 2.0
    backoff_factor: float = 0.5
    growth_interval: int = 2000
    enabled: bool = True


@dataclass
class InitProcessGroupKwargs(KwargsHandler):
    """
    Use this object in your [`Accelerator`] to customize the initialization of the distributed processes. Please refer
    to the documentation of this
    [method](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for more
    information on each argument.

    Note: If `timeout` is set to `None`, the default will be based upon how `backend` is set: 600 seconds for
    `"nccl"` and 1800 seconds for any other backend.

    ```python
    from datetime import timedelta
    from accelerate import Accelerator
    from accelerate.utils import InitProcessGroupKwargs

    kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=800))
    accelerator = Accelerator(kwargs_handlers=[kwargs])
    ```
    """

    backend: Optional[str] = "nccl"
    init_method: Optional[str] = None
    timeout: Optional[timedelta] = None

    def __post_init__(self):
        # An explicit timeout always wins; only fill in the backend-dependent default.
        if self.timeout is not None:
            return
        default_seconds = 600 if self.backend == "nccl" else 1800
        self.timeout = timedelta(seconds=default_seconds)


# Literals
# Allowed string values for the FP8-related options; the kwargs classes below check their fields against
# `get_args(...)` of these aliases in `__post_init__`.
# `backend` of `FP8RecipeKwargs` (compared after upper-casing).
Backend = Literal["MSAMP", "TE"]
# `opt_level` of `MSAMPRecipeKwargs`.
OptLevel = Literal["O1", "O2"]
# `fp8_format` of `TERecipeKwargs` (compared after upper-casing).
FP8Format = Literal["HYBRID", "E4M3", "E5M2"]
# `amax_compute_algo` of `TERecipeKwargs` (compared after lower-casing).
AmaxComputeAlgorithm = Literal["max", "most_recent"]


# FP8 training recipe kwargs
@dataclass
class AORecipeKwargs(KwargsHandler):
    """
    Use this object in your [`Accelerator`] to customize the initialization of the recipe for FP8 mixed precision
    training with `torchao` FP8.

    Args:
        config (`torchao.float8.Float8LinearConfig`, *optional*, default to `None`):
            The configuration for the FP8 training. If `None`, a default config will be created with sensible
            defaults for most use cases:
            - `pad_inner_dim=True`: Pads matrix dimensions to be divisible by 16, required for `torch._scaled_mm`
              operations to prevent runtime errors.
            - `enable_fsdp_float8_all_gather=True`: Enables FP8 all-gather for FSDP2. This provides memory bandwidth
              savings by casting parameters before the all-gather operation, saving 50% bandwidth compared to BF16.

            You can override these defaults by providing your own `Float8LinearConfig` instance.
        module_filter_func (`Callable`, *optional*, default to `None`):
            Optional function that must take in a module and layer name, and returns a boolean indicating whether the
            module should be converted to FP8. Defaults to `accelerate.utils.ao.filter_linear_layers`. See it for an
            example.
        pad_inner_dim (`bool`, *optional*, default to `None`):
            Only used when `config` is `None`. When left unset, it is read from the `ACCELERATE_FP8_PAD_INNER_DIM`
            environment flag (default `True`) and passed to the generated `Float8LinearConfig`.
        enable_fsdp_float8_all_gather (`bool`, *optional*, default to `None`):
            Only used when `config` is `None`. When left unset, it is read from the
            `ACCELERATE_FP8_ENABLE_FSDP_FLOAT8_ALL_GATHER` environment flag (default `True`) and passed to the
            generated `Float8LinearConfig`.
    """

    config: Optional["Float8LinearConfig"] = None
    module_filter_func: Optional[Callable] = None
    pad_inner_dim: Optional[bool] = None
    enable_fsdp_float8_all_gather: Optional[bool] = None

    def __post_init__(self):
        if not is_torchao_available():
            raise ImportError("TorchAO is not available. Please install it or use a different backend.")

        # A user-supplied config is used untouched.
        if self.config is not None:
            return

        from torchao.float8 import Float8LinearConfig

        env_prefix = "ACCELERATE_FP8_"

        # Fall back to the environment (then to `True`) for whatever was not passed explicitly
        if self.pad_inner_dim is None:
            self.pad_inner_dim = parse_flag_from_env(env_prefix + "PAD_INNER_DIM", default=True)
        if self.enable_fsdp_float8_all_gather is None:
            self.enable_fsdp_float8_all_gather = parse_flag_from_env(
                env_prefix + "ENABLE_FSDP_FLOAT8_ALL_GATHER", default=True
            )

        self.config = Float8LinearConfig(
            pad_inner_dim=self.pad_inner_dim,
            enable_fsdp_float8_all_gather=self.enable_fsdp_float8_all_gather,
        )


@dataclass
class TERecipeKwargs(KwargsHandler):
    """
    Use this object in your [`Accelerator`] to customize the initialization of the recipe for FP8 mixed precision
    training with `transformer-engine`.

    Every argument left to `None` is filled in `__post_init__` from an `ACCELERATE_FP8_*` environment variable, or
    from the documented default when that variable is not set.

    <Tip>

        For more information on the args, please refer to the API
        [documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html).

    </Tip>

    ```python
    from accelerate import Accelerator
    from accelerate.utils import TERecipeKwargs

    kwargs = TERecipeKwargs(fp8_format="HYBRID")
    accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[kwargs])
    ```

    Args:
        use_autocast_during_eval (`bool`, *optional*, default to `False`):
            Whether to use FP8 autocast during eval mode. Generally better metrics are found when this is `False`.
        margin (`int`, *optional*, default to 0):
            The margin to use for the gradient scaling.
        interval (`int`, *optional*, default to 1):
            The interval to use for how often the scaling factor is recomputed.
        fp8_format (`str`, *optional*, default to "HYBRID"):
            The format to use for the FP8 recipe. Must be one of `HYBRID`, `E4M3` or `E5M2`. (Generally `HYBRID` for
            training, `E4M3` or `E5M2` for evaluation)
        amax_history_len (`int`, *optional*, default to 1024):
            The length of the history to use for the scaling factor computation
        amax_compute_algo (`str`, *optional*, default to "most_recent"):
            The algorithm to use for the scaling factor computation. Must be one of `max` or `most_recent`.
        override_linear_precision (`tuple` of three `bool`, *optional*, default to `(False, False, False)`):
            Whether or not to execute `fprop`, `dgrad`, and `wgrad` GEMMS in higher precision.
        use_mxfp8_block_scaling (`bool`, *optional*):
            Flag read from `ACCELERATE_FP8_USE_MXFP8_BLOCK_SCALING` when left unset.
    """

    use_autocast_during_eval: Optional[bool] = None
    margin: Optional[int] = None
    interval: Optional[int] = None
    fp8_format: FP8Format = None
    amax_history_len: Optional[int] = None
    amax_compute_algo: AmaxComputeAlgorithm = None
    override_linear_precision: tuple[bool, bool, bool] = None
    use_mxfp8_block_scaling: Optional[bool] = None

    def __post_init__(self):
        if not is_transformer_engine_available():
            raise ImportError("TransformerEngine is not available. Please install it or use a different backend.")

        env_prefix = "ACCELERATE_FP8_"

        if self.use_autocast_during_eval is None:
            self.use_autocast_during_eval = parse_flag_from_env(env_prefix + "USE_AUTOCAST_DURING_EVAL")
        if self.margin is None:
            self.margin = int(os.getenv(env_prefix + "MARGIN", 0))
        if self.interval is None:
            self.interval = int(os.getenv(env_prefix + "INTERVAL", 1))

        # The format is compared upper-cased, so user input is case-insensitive.
        requested_format = self.fp8_format
        if requested_format is None:
            requested_format = os.getenv(env_prefix + "FORMAT", "HYBRID")
        self.fp8_format = requested_format.upper()
        allowed_formats = get_args(FP8Format)
        if self.fp8_format not in allowed_formats:
            raise ValueError(f"`fp8_format` must be one of {' or '.join(allowed_formats)}.")

        # The amax algorithm is compared lower-cased.
        requested_algo = self.amax_compute_algo
        if requested_algo is None:
            requested_algo = os.getenv(env_prefix + "AMAX_COMPUTE_ALGO", "most_recent")
        self.amax_compute_algo = requested_algo.lower()
        allowed_algos = get_args(AmaxComputeAlgorithm)
        if self.amax_compute_algo not in allowed_algos:
            raise ValueError(f"`amax_compute_algo` must be one of {' or '.join(allowed_algos)}")

        if self.amax_history_len is None:
            self.amax_history_len = int(os.getenv(env_prefix + "AMAX_HISTORY_LEN", 1024))
        if self.override_linear_precision is None:
            # One flag per GEMM, in (fprop, dgrad, wgrad) order.
            self.override_linear_precision = tuple(
                parse_flag_from_env(env_prefix + flag_name)
                for flag_name in ("OVERRIDE_FPROP", "OVERRIDE_DGRAD", "OVERRIDE_WGRAD")
            )
        if self.use_mxfp8_block_scaling is None:
            self.use_mxfp8_block_scaling = parse_flag_from_env(env_prefix + "USE_MXFP8_BLOCK_SCALING")


@dataclass
class MSAMPRecipeKwargs(KwargsHandler):
    """
    Pass this object to your [`Accelerator`] to control how the `ms-amp` recipe for FP8 mixed precision training is
    initialized.

    Args:
        opt_level (`str`, *optional*, defaults to `None`):
            The optimization level to use; must be one of the values accepted by `OptLevel`. When left as `None`, it
            is read from the `ACCELERATE_FP8_OPT_LEVEL` environment variable and falls back to `"O2"`.
    """

    opt_level: OptLevel = None

    def __post_init__(self):
        """Resolve `opt_level` from the environment when unset, then validate it."""
        env_prefix = "ACCELERATE_FP8_"
        level = self.opt_level
        if level is None:
            level = os.environ.get(env_prefix + "OPT_LEVEL", "O2")
            self.opt_level = level
        allowed_levels = get_args(OptLevel)
        if level not in allowed_levels:
            raise ValueError(f"`opt_level` must be one of {' or '.join(allowed_levels)}")


@dataclass
class FP8RecipeKwargs(TERecipeKwargs, MSAMPRecipeKwargs):
    """
    Deprecated. Please use one of the proper FP8 recipe kwargs classes such as `TERecipeKwargs` or `MSAMPRecipeKwargs`
    instead.

    Args:
        backend (`str`, *optional*, defaults to `None`):
            The FP8 backend to use; compared in upper case against the values of `Backend`. When left as `None`, it
            is read from the `ACCELERATE_FP8_BACKEND` environment variable, falling back to `"msamp"` if MS-AMP is
            available and `"te"` otherwise.
    """

    backend: Backend = None

    def __post_init__(self):
        """Emit the deprecation warning, resolve and validate `backend`, then run the parent post-init."""
        warnings.warn(
            "FP8RecipeKwargs is deprecated and will be removed in Accelerate v2.0.0. "
            "Please use one of the proper FP8 recipe kwargs classes such as TERecipeKwargs or MSAMPRecipeKwargs instead.",
            FutureWarning,
        )

        env_prefix = "ACCELERATE_FP8_"
        fallback_backend = "msamp" if is_msamp_available() else "te"

        selected = self.backend
        if selected is None:
            selected = os.environ.get(env_prefix + "BACKEND", fallback_backend)
        self.backend = selected.upper()

        if self.backend not in get_args(Backend):
            raise ValueError("`backend` must be 'MSAMP' or 'TE' (TransformerEngine) to use `FP8RecipeKwargs`.")

        # NOTE(review): with this base order, `super()` resolves to `TERecipeKwargs.__post_init__` first. Whether
        # `MSAMPRecipeKwargs.__post_init__` also runs depends on that method chaining to `super()` -- confirm.
        super().__post_init__()


# Activity names accepted by `ProfileKwargs.activities`; `ProfileKwargs._get_profiler_activity` maps each name to the
# corresponding `torch.profiler.ProfilerActivity` member.
ProfilerActivity = Literal["cpu", "xpu", "mtia", "cuda", "hpu"]


@dataclass
class ProfileKwargs(KwargsHandler):
    """
    Pass this object to your [`Accelerator`] to configure how the profiler is created. Every argument mirrors the one
    of the same name on this [context manager](https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile);
    see its documentation for details.

    <Tip warning={true}>

    `torch.profiler` is only available in PyTorch 1.8.1 and later versions.

    </Tip>

    Example:

    ```python
    from accelerate import Accelerator
    from accelerate.utils import ProfileKwargs

    kwargs = ProfileKwargs(activities=["cpu", "cuda"])
    accelerator = Accelerator(kwargs_handlers=[kwargs])
    ```

    Args:
        activities (`List[str]`, *optional*, default to `None`):
            The list of activity groups to use in profiling. Each entry must be one of `"cpu"`, `"xpu"`, `"mtia"`,
            `"hpu"` or `"cuda"`.
        schedule_option (`Dict[str, int]`, *optional*, default to `None`):
            Keyword arguments forwarded to `torch.profiler.schedule`. Available keys are `wait`, `warmup`, `active`,
            `repeat` and `skip_first`. The profiler will skip the first `skip_first` steps, then wait for `wait`
            steps, then do the warmup for the next `warmup` steps, then do the active recording for the next `active`
            steps and then repeat the cycle starting with `wait` steps. The optional number of cycles is specified
            with the `repeat` parameter, the zero value means that the cycles will continue until the profiling is
            finished.
        on_trace_ready (`Callable`, *optional*, default to `None`):
            Callable that is called at each step when schedule returns `ProfilerAction.RECORD_AND_SAVE` during the
            profiling.
        record_shapes (`bool`, *optional*, default to `False`):
            Save information about the input shapes of operators.
        profile_memory (`bool`, *optional*, default to `False`):
            Track tensor memory allocation/deallocation.
        with_stack (`bool`, *optional*, default to `False`):
            Record source information (file and line number) for the ops.
        with_flops (`bool`, *optional*, default to `False`):
            Use formula to estimate the FLOPS of specific operators.
        with_modules (`bool`, *optional*, default to `False`):
            Record module hierarchy (including function names) corresponding to the callstack of the op.
        output_trace_dir (`str`, *optional*, default to `None`):
            Exports the collected trace in Chrome JSON format. Chrome use 'chrome://tracing' view json file. Defaults
            to None, which means profiling does not store json files.
    """

    activities: Optional[list[ProfilerActivity]] = None
    schedule_option: Optional[dict[str, int]] = None
    on_trace_ready: Optional[Callable] = None
    record_shapes: bool = False
    profile_memory: bool = False
    with_stack: bool = False
    with_flops: bool = False
    with_modules: bool = False
    output_trace_dir: Optional[str] = None

    def _get_profiler_activity(self, activity: ProfilerActivity) -> torch.profiler.ProfilerActivity:
        """Translate an activity name into the matching `torch.profiler.ProfilerActivity` member.

        Args:
            activity (str): The profiler activity name.

        Returns:
            torch.profiler.ProfilerActivity: The profiler activity.

        Raises:
            ValueError: If `activity` is not among the names supported in the current environment.
        """
        # "cpu" and "cuda" are always offered; the remaining names are only registered when the corresponding
        # availability / version check passes.
        supported: dict[str, torch.profiler.ProfilerActivity] = {
            "cpu": torch.profiler.ProfilerActivity.CPU,
            "cuda": torch.profiler.ProfilerActivity.CUDA,
        }

        if is_hpu_available():
            supported["hpu"] = torch.profiler.ProfilerActivity.HPU

        if is_torch_version(">=", XPU_PROFILING_AVAILABLE_PYTORCH_VERSION) and torch.xpu.is_available():
            supported["xpu"] = torch.profiler.ProfilerActivity.XPU

        if is_torch_version(">=", MITA_PROFILING_AVAILABLE_PYTORCH_VERSION) and torch.mtia.is_available():
            supported["mtia"] = torch.profiler.ProfilerActivity.MTIA

        resolved = supported.get(activity)
        if resolved is None:
            raise ValueError(f"Invalid profiler activity: {activity}. Must be one of {list(supported)}.")
        return resolved

    def build(self) -> torch.profiler.profile:
        """
        Create a `torch.profiler.profile` object from the options stored on this instance.

        `output_trace_dir` is not consumed here; only the options passed below reach the profiler.

        Returns:
            torch.profiler.profile: The profiler object.
        """
        resolved_activities: Optional[list[torch.profiler.ProfilerActivity]] = (
            None if self.activities is None else [self._get_profiler_activity(name) for name in self.activities]
        )
        profiler_schedule: Optional[torch.profiler.schedule] = (
            None if self.schedule_option is None else torch.profiler.schedule(**self.schedule_option)
        )

        return torch.profiler.profile(
            activities=resolved_activities,
            schedule=profiler_schedule,
            on_trace_ready=self.on_trace_ready,
            record_shapes=self.record_shapes,
            profile_memory=self.profile_memory,
            with_stack=self.with_stack,
            with_flops=self.with_flops,
            with_modules=self.with_modules,
        )


class DistributedType(str, enum.Enum):
    """
    Represents a type of distributed environment. Each member's value is its own name as a string.

    Values:

        - **NO** -- Not a distributed environment, just a single process.
        - **MULTI_CPU** -- Distributed on multiple CPU nodes.
        - **MULTI_GPU** -- Distributed on multiple GPUs.
        - **MULTI_NPU** -- Distributed on multiple NPUs.
        - **MULTI_MLU** -- Distributed on multiple MLUs.
        - **MULTI_SDAA** -- Distributed on multiple SDAAs.
        - **MULTI_MUSA** -- Distributed on multiple MUSAs.
        - **MULTI_XPU** -- Distributed on multiple XPUs.
        - **DEEPSPEED** -- Using DeepSpeed.
        - **FSDP** -- Using Fully Sharded Data Parallelism (FSDP).
        - **XLA** -- Using TorchXLA.
        - **MEGATRON_LM** -- Using Megatron-LM.
        - **MULTI_HPU** -- Distributed on multiple HPUs.
        - **MULTI_NEURON** -- Distributed on multiple Neuron cores.
    """

    # Subclassing str as well as Enum allows the `DistributedType` to be JSON-serializable out of the box.
    NO = "NO"
    MULTI_CPU = "MULTI_CPU"
    MULTI_GPU = "MULTI_GPU"
    MULTI_NPU = "MULTI_NPU"
    MULTI_MLU = "MULTI_MLU"
    MULTI_SDAA = "MULTI_SDAA"
    MULTI_MUSA = "MULTI_MUSA"
    MULTI_XPU = "MULTI_XPU"
    DEEPSPEED = "DEEPSPEED"
    FSDP = "FSDP"
    XLA = "XLA"
    MEGATRON_LM = "MEGATRON_LM"
    MULTI_HPU = "MULTI_HPU"
    MULTI_NEURON = "MULTI_NEURON"


class SageMakerDistributedType(str, enum.Enum):
    """
    Represents a type of distributed environment on SageMaker. Each member's value is its own name as a string.

    Values:

        - **NO** -- Not a distributed environment, just a single process.
        - **DATA_PARALLEL** -- using sagemaker distributed data parallelism.
        - **MODEL_PARALLEL** -- using sagemaker distributed model parallelism.
    """

    # Subclassing str as well as Enum allows the `SageMakerDistributedType` to be JSON-serializable out of the box.
    NO = "NO"
    DATA_PARALLEL = "DATA_PARALLEL"
    MODEL_PARALLEL = "MODEL_PARALLEL"


class FP8BackendType(str, enum.Enum):
    """
    Represents the backend used for FP8. Each member's value is its own name as a string.

    Values:

        - **NO** -- not using any FP8 backend.
        - **TE** -- using TransformerEngine.
        - **MSAMP** -- using msamp.
        - **AO** -- using torchao.
    """

    # Subclassing str as well as Enum allows the `FP8BackendType` to be JSON-serializable out of the box.
    NO = "NO"
    TE = "TE"
    MSAMP = "MSAMP"
    AO = "AO"


class ComputeEnvironment(str, enum.Enum):
    """
    Represents a type of the compute environment. Each member's value is its own name as a string.

    Values:

        - **LOCAL_MACHINE** -- private/custom cluster hardware.
        - **AMAZON_SAGEMAKER** -- Amazon SageMaker as compute environment.
    """

    # Subclassing str as well as Enum allows the `ComputeEnvironment` to be JSON-serializable out of the box.
    LOCAL_MACHINE = "LOCAL_MACHINE"
    AMAZON_SAGEMAKER = "AMAZON_SAGEMAKER"


class DynamoBackend(str, BaseEnum):
    """
    Represents a dynamo backend (see https://pytorch.org/docs/stable/torch.compiler.html). Each member's value is its
    own name as an upper-case string.

    Values:

        - **NO** -- Do not use torch dynamo.
        - **EAGER** -- Uses PyTorch to run the extracted GraphModule. This is quite useful in debugging TorchDynamo
          issues.
        - **AOT_EAGER** -- Uses AotAutograd with no compiler, i.e, just using PyTorch eager for the AotAutograd's
          extracted forward and backward graphs. This is useful for debugging, and unlikely to give speedups.
        - **INDUCTOR** -- Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging codegened Triton
          kernels. [Read
          more](https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747)
        - **AOT_TS_NVFUSER** -- nvFuser with AotAutograd/TorchScript. [Read
          more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593)
        - **NVPRIMS_NVFUSER** -- nvFuser with PrimTorch. [Read
          more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593)
        - **CUDAGRAPHS** -- cudagraphs with AotAutograd. [Read more](https://github.com/pytorch/torchdynamo/pull/757)
        - **OFI** -- Uses Torchscript optimize_for_inference. Inference only. [Read
          more](https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html)
        - **FX2TRT** -- Uses Nvidia TensorRT for inference optimizations. Inference only. [Read
          more](https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst)
        - **ONNXRT** -- Uses ONNXRT for inference on CPU/GPU. Inference only. [Read more](https://onnxruntime.ai/)
        - **TENSORRT** -- Uses ONNXRT to run TensorRT for inference optimizations. [Read
          more](https://github.com/onnx/onnx-tensorrt)
        - **AOT_TORCHXLA_TRACE_ONCE** -- Uses Pytorch/XLA with TorchDynamo optimization, for training. [Read
          more](https://github.com/pytorch/xla/blob/r2.0/docs/dynamo.md)
        - **TORCHXLA_TRACE_ONCE** -- Uses Pytorch/XLA with TorchDynamo optimization, for inference. [Read
          more](https://github.com/pytorch/xla/blob/r2.0/docs/dynamo.md)
        - **TVM** -- Uses Apache TVM for inference optimizations. [Read more](https://tvm.apache.org/)
        - **HPU_BACKEND** -- Uses HPU backend for inference optimizations.

    """

    # Subclassing str as well as Enum allows the `DynamoBackend` to be JSON-serializable out of the box.
    NO = "NO"
    EAGER = "EAGER"
    AOT_EAGER = "AOT_EAGER"
    INDUCTOR = "INDUCTOR"
    AOT_TS_NVFUSER = "AOT_TS_NVFUSER"
    NVPRIMS_NVFUSER = "NVPRIMS_NVFUSER"
    CUDAGRAPHS = "CUDAGRAPHS"
    OFI = "OFI"
    FX2TRT = "FX2TRT"
    ONNXRT = "ONNXRT"
    TENSORRT = "TENSORRT"
    AOT_TORCHXLA_TRACE_ONCE = "AOT_TORCHXLA_TRACE_ONCE"
    TORCHXLA_TRACE_ONCE = "TORCHXLA_TRACE_ONCE"
    TVM = "TVM"
    HPU_BACKEND = "HPU_BACKEND"


class LoggerType(BaseEnum):
    """Represents a type of supported experiment tracker

    Values:

        - **ALL** -- all available trackers in the environment that are supported
        - **AIM** -- aim as an experiment tracker
        - **TENSORBOARD** -- TensorBoard as an experiment tracker
        - **WANDB** -- wandb as an experiment tracker
        - **TRACKIO** -- trackio as an experiment tracker
        - **COMETML** -- comet_ml as an experiment tracker
        - **MLFLOW** -- mlflow as an experiment tracker
        - **CLEARML** -- clearml as an experiment tracker
        - **DVCLIVE** -- dvclive as an experiment tracker
        - **SWANLAB** -- swanlab as an experiment tracker
    """

    ALL = "all"
    AIM = "aim"
    TENSORBOARD = "tensorboard"
    WANDB = "wandb"
    TRACKIO = "trackio"
    COMETML = "comet_ml"
    MLFLOW = "mlflow"
    CLEARML = "clearml"
    DVCLIVE = "dvclive"
    SWANLAB = "swanlab"


class PrecisionType(str, BaseEnum):
    """Represents a type of precision used on floating point values

    Values:

        - **NO** -- using full precision (FP32)
        - **FP8** -- using 8-bit floating point precision
        - **FP16** -- using half precision
        - **BF16** -- using brain floating point precision
    """

    NO = "no"
    FP8 = "fp8"
    FP16 = "fp16"
    BF16 = "bf16"


class RNGType(BaseEnum):
    """Enumerates the supported random number generator (RNG) types; each value is the lower-case member name."""

    TORCH = "torch"
    CUDA = "cuda"
    MLU = "mlu"
    SDAA = "sdaa"
    MUSA = "musa"
    NPU = "npu"
    XLA = "xla"
    XPU = "xpu"
    HPU = "hpu"
    NEURON = "neuron"
    GENERATOR = "generator"


class CustomDtype(enum.Enum):
    r"""
    An enum that contains multiple custom dtypes that can be used for `infer_auto_device_map`.

    Values:

        - **FP8** -- 8-bit floating point (`"fp8"`)
        - **INT4** -- 4-bit integer (`"int4"`)
        - **INT2** -- 2-bit integer (`"int2"`)
    """

    FP8 = "fp8"
    INT4 = "int4"
    INT2 = "int2"


# data classes


@dataclass
class TensorInformation:
    """Lightweight record of a tensor's metadata: its `shape` and its `dtype`."""

    shape: torch.Size
    dtype: torch.dtype

@dataclass
class DataLoaderConfiguration:
    """
    Configuration for dataloader-related items when calling `accelerator.prepare`.

    Args:
        split_batches (`bool`, defaults to `False`):
            Whether or not the accelerator should split the batches yielded by the dataloaders across the devices. If
            `True`, the actual batch size used will be the same on any kind of distributed processes, but it must be a
            round multiple of `num_processes` you are using. If `False`, actual batch size used will be the one set in
            your script multiplied by the number of processes.
        dispatch_batches (`bool`, defaults to `None`):
            If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main process
            and then the batches are split and broadcast to each process. Will default to `True` for `DataLoader` whose
            underlying dataset is an `IterableDataset`, `False` otherwise.
        even_batches (`bool`, defaults to `True`):
            If set to `True`, in cases where the total batch size across all processes does not exactly divide the
            dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among
            all workers.
        use_seedable_sampler (`bool`, defaults to `False`):
            Whether or not use a fully seedable random sampler ([`data_loader.SeedableRandomSampler`]). Ensures
            training results are fully reproducible using a different sampling technique. While seed-to-seed results
            may differ, on average the differences are negligible when using multiple different seeds to compare.
            Should also be ran with [`~utils.set_seed`] for the best results.
        data_seed (`int`, defaults to `None`):
            The seed to use for the underlying generator when using `use_seedable_sampler`. If `None`, the generator
            will use the current default seed from torch.
        non_blocking (`bool`, defaults to `False`):
            If set to `True`, the dataloader prepared by the Accelerator will utilize non-blocking host-to-device
            transfers, allowing for better overlap between dataloader communication and computation. Recommended that
            the prepared dataloader has `pin_memory` set to `True` to work properly.
        use_stateful_dataloader (`bool`, defaults to `False`):
            If set to `True`, the dataloader prepared by the Accelerator will be backed by
            [torchdata.StatefulDataLoader](https://github.com/pytorch/data/tree/main/torchdata/stateful_dataloader).
            This requires `torchdata` version 0.8.0 or higher that supports StatefulDataLoader to be installed.
    """

    split_batches: bool = field(
        default=False,
        metadata={
            "help": "Whether or not the accelerator should split the batches yielded by the dataloaders across the devices. If"
            " `True` the actual batch size used will be the same on any kind of distributed processes, but it must be a"
            " round multiple of the `num_processes` you are using. If `False`, actual batch size used will be the one set"
            " in your script multiplied by the number of processes."
        },
    )
    dispatch_batches: bool = field(
        default=None,
        metadata={
            "help": "If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main process"
            " and then the batches are split and broadcast to each process. Will default to `True` for `DataLoader` whose"
            " underlying dataset is an `IterableDataset`, `False` otherwise."
        },
    )
    even_batches: bool = field(
        default=True,
        metadata={
            "help": "If set to `True`, in cases where the total batch size across all processes does not exactly divide the"
            " dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among"
            " all workers."
        },
    )
    use_seedable_sampler: bool = field(
        default=False,
        metadata={
            # Adjacent literals are concatenated verbatim, so each fragment must carry its own separating space.
            "help": "Whether or not use a fully seedable random sampler ([`data_loader.SeedableRandomSampler`]). "
            "Ensures training results are fully reproducible using a different sampling technique. "
            "While seed-to-seed results may differ, on average the differences are negligible when using "
            "multiple different seeds to compare. Should also be ran with [`~utils.set_seed`] for the best results."
        },
    )
    data_seed: int = field(
        default=None,
        metadata={
            "help": "The seed to use for the underlying generator when using `use_seedable_sampler`. If `None`, the generator"
            " will use the current default seed from torch."
        },
    )
    non_blocking: bool = field(
        default=False,
        metadata={
            "help": "If set to `True`, the dataloader prepared by the Accelerator will utilize non-blocking host-to-device"
            " transfers, allowing for better overlap between dataloader communication and computation.  Recommended that the"
            " prepared dataloader has `pin_memory` set to `True` to work properly."
        },
    )
    use_stateful_dataloader: bool = field(
        default=False,
        metadata={
            "help": "If set to `True`, the dataloader prepared by the Accelerator will be backed by "
            "[torchdata.StatefulDataLoader](https://github.com/pytorch/data/tree/main/torchdata/stateful_dataloader). This requires `torchdata` version 0.8.0 or higher that supports StatefulDataLoader to be installed."
        },
    )


@dataclass
class ProjectConfiguration:
    """
    Project-level settings consumed by the Accelerator: where data and logs are stored and how saved states are
    named and retained.

    Args:
        project_dir (`str`, defaults to `None`):
            A path to a directory for storing data.
        logging_dir (`str`, defaults to `None`):
            A path to a directory for storing logs of locally-compatible loggers. If None, defaults to `project_dir`.
        automatic_checkpoint_naming (`bool`, defaults to `False`):
            Whether saved states should be automatically iteratively named.
        total_limit (`int`, defaults to `None`):
            The maximum number of total saved states to keep.
        iteration (`int`, defaults to `0`):
            The current save iteration.
        save_on_each_node (`bool`, defaults to `False`):
            When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on
            the main one.
    """

    project_dir: str = field(
        default=None,
        metadata={"help": "A path to a directory for storing data."},
    )
    logging_dir: str = field(
        default=None,
        metadata={
            "help": "A path to a directory for storing logs of locally-compatible loggers. If None, defaults to `project_dir`."
        },
    )
    automatic_checkpoint_naming: bool = field(
        default=False,
        metadata={"help": "Whether saved states should be automatically iteratively named."},
    )
    total_limit: int = field(
        default=None,
        metadata={"help": "The maximum number of total saved states to keep."},
    )
    iteration: int = field(
        default=0,
        metadata={"help": "The current save iteration."},
    )
    save_on_each_node: bool = field(
        default=False,
        metadata={
            "help": (
                "When doing multi-node distributed training, whether to save models and checkpoints on each node, or"
                " only on the main one"
            )
        },
    )

    def __post_init__(self):
        # Apply the `logging_dir` fallback as soon as the instance exists.
        self.set_directories(self.project_dir)

    def set_directories(self, project_dir: Optional[str] = None):
        """
        Store `project_dir` on the instance and, when no `logging_dir` has been set yet, use it as `logging_dir` too.

        An already-set `logging_dir` is left untouched.
        """
        self.project_dir = project_dir
        if self.logging_dir is not None:
            return
        self.logging_dir = project_dir


@dataclass
class GradientAccumulationPlugin(KwargsHandler):
    """
    A plugin to configure gradient accumulation behavior. You can only pass one of `gradient_accumulation_plugin` or
    `gradient_accumulation_steps` to [`Accelerator`]. Passing both raises an error.

    Parameters:
        num_steps (`int`):
            The number of steps to accumulate gradients for.
        adjust_scheduler (`bool`, *optional*, defaults to `True`):
            Whether to adjust the scheduler steps to account for the number of steps being accumulated. Should be
            `True` if the used scheduler was not adjusted for gradient accumulation.
        sync_with_dataloader (`bool`, *optional*, defaults to `True`):
            Whether to synchronize setting the gradients when at the end of the dataloader. Should only be set to
            `False` if you know what you're doing.
        sync_each_batch (`bool`, *optional*, defaults to `False`):
            Whether to synchronize setting the gradients at each data batch. Setting to `True` may reduce memory
            requirements when using gradient accumulation with distributed training, at expense of speed.

    Example:

    ```python
    from accelerate import Accelerator
    from accelerate.utils import GradientAccumulationPlugin

    gradient_accumulation_plugin = GradientAccumulationPlugin(num_steps=2)
    accelerator = Accelerator(gradient_accumulation_plugin=gradient_accumulation_plugin)
    ```
    """

    num_steps: int = field(
        default=None,
        metadata={"help": "The number of steps to accumulate gradients for."},
    )
    adjust_scheduler: bool = field(
        default=True,
        metadata={
            "help": "Whether to adjust the scheduler steps to account for the number of steps being accumulated. Should be `True` if the used scheduler was not adjusted for gradient accumulation."
        },
    )
    sync_with_dataloader: bool = field(
        default=True,
        metadata={
            "help": "Whether to synchronize setting the gradients when at the end of the dataloader. Should only be set to `False` if you know what you're doing."
        },
    )
    sync_each_batch: bool = field(
        default=False,
        metadata={
            "help": "Whether to synchronize setting the gradients at each data batch. Setting to `True` may reduce memory requirements when using gradient accumulation with distributed training, at expense of speed."
        },
    )


@dataclass
class TorchDynamoPlugin(KwargsHandler):
    """
    This plugin is used to compile a model with PyTorch 2.0

    Options left as `None` are resolved in `__post_init__` from `ACCELERATE_DYNAMO_*` environment variables
    (`BACKEND`, `MODE`, `USE_FULLGRAPH`, `USE_REGIONAL_COMPILATION`, `USE_DYNAMIC`).

    Args:
        backend (`DynamoBackend`, defaults to `None`):
            A valid Dynamo backend. See https://pytorch.org/docs/stable/torch.compiler.html for more details.
        mode (`str`, defaults to `None`):
            Possible options are 'default', 'reduce-overhead' or 'max-autotune'.
        fullgraph (`bool`, defaults to `None`):
            Whether to require the whole model to be captured as a single graph. If `False`, the model may be broken
            into several subgraphs.
        dynamic (`bool`, defaults to `None`):
            Whether to use dynamic shape for tracing.
        options (`Any`, defaults to `None`):
            A dictionary of options to pass to the backend.
        disable (`bool`, defaults to `False`):
            Turn torch.compile() into a no-op for testing
        use_regional_compilation (`bool`, defaults to `None`):
            Use it to reduce the cold start compilation time of torch.compile() by targeting repeated blocks of the
            same class and compiling them sequentially to hit the compiler's cache. For example, in `GPT2LMHeadModel`,
            the repeated block/class is `GPT2Block`, and can be accessed as `model.transformer.h[0]`. The rest of the
            model (e.g model.lm_head) is compiled separately.
    """

    backend: DynamoBackend = field(
        default=None,
        metadata={"help": f"Possible options are {[b.value.lower() for b in DynamoBackend]}"},
    )
    mode: str = field(
        default=None,
        metadata={"help": "Possible options are 'default', 'reduce-overhead' or 'max-autotune'"},
    )
    fullgraph: bool = field(
        default=None,
        metadata={
            "help": "Whether to require the whole model to be captured as a single graph. If `False`, the model "
            "may be broken into several subgraphs"
        },
    )
    dynamic: bool = field(default=None, metadata={"help": "Whether to use dynamic shape for tracing"})
    options: Any = field(
        default=None,
        metadata={"help": "A dictionary of options to pass to the backend."},
    )
    disable: bool = field(
        default=False,
        metadata={"help": "Turn torch.compile() into a no-op for testing"},
    )

    use_regional_compilation: bool = field(
        default=None,
        metadata={
            "help": (
                # https://pytorch.org/tutorials/recipes/regional_compilation.html
                "Use it to reduce the cold start compilation time of torch.compile() by targeting repeated "
                "blocks of the same class and compiling them sequentially to hit the compiler's cache. For "
                "example, in `GPT2LMHeadModel`, the repeated block/class is `GPT2Block`, and can be accessed "
                "as `model.transformer.h[0]`. The rest of the model (e.g model.lm_head) is compiled separately."
            )
        },
    )

    def __post_init__(self):
        """
        Resolve unset options from the environment and convert `backend` into a `DynamoBackend` member.

        `dynamic` is only overridden when `ACCELERATE_DYNAMO_USE_DYNAMIC` is actually present in the environment;
        otherwise it stays `None`.
        """
        prefix = "ACCELERATE_DYNAMO_"
        if self.backend is None:
            self.backend = os.environ.get(prefix + "BACKEND", "no")
        # Members are declared with upper-case values, so normalize before the enum lookup.
        self.backend = DynamoBackend(self.backend.upper())

        if self.mode is None:
            self.mode = os.environ.get(prefix + "MODE", "default")
        if self.fullgraph is None:
            self.fullgraph = str_to_bool(os.environ.get(prefix + "USE_FULLGRAPH", "False")) == 1
        if self.use_regional_compilation is None:
            self.use_regional_compilation = (
                str_to_bool(os.environ.get(prefix + "USE_REGIONAL_COMPILATION", "False")) == 1
            )

        if self.dynamic is None:
            # Read the variable once; an absent variable must leave `dynamic` as `None` rather than `False`.
            env_dynamic = os.environ.get(prefix + "USE_DYNAMIC")
            if env_dynamic is not None:
                self.dynamic = str_to_bool(env_dynamic) == 1

    def to_dict(self):
        """Return a deep copy of the instance attributes with `backend` replaced by its lower-cased string value."""
        dynamo_config = copy.deepcopy(self.__dict__)
        dynamo_config["backend"] = dynamo_config["backend"].value.lower()
        return dynamo_config

    def to_kwargs(self):
        """
        Return the parent's `to_kwargs()` result without the `use_regional_compilation` entry.

        NOTE(review): presumably dropped because the result is forwarded to `torch.compile`, which has no such
        argument -- confirm against the caller.
        """
        kwargs = super().to_kwargs()
        kwargs.pop("use_regional_compilation", None)
        return kwargs


@dataclass
class DeepSpeedPlugin:
    """
    This plugin is used to integrate DeepSpeed.

    Args:
        hf_ds_config (`Any`, defaults to `None`):
            Path to DeepSpeed config file or dict or an object of class `accelerate.utils.deepspeed.HfDeepSpeedConfig`.
        gradient_accumulation_steps (`int`, defaults to `None`):
            Number of steps to accumulate gradients before updating optimizer states. If not set, will use the value
            from the `Accelerator` directly.
        gradient_clipping (`float`, defaults to `None`):
            Enable gradient clipping with value.
        zero_stage (`int`, defaults to `None`):
            Possible options are 0, 1, 2, 3. Default will be taken from environment variable.
        is_train_batch_min (`bool`, defaults to `True`):
            If both train & eval dataloaders are specified, this will decide the `train_batch_size`.
        offload_optimizer_device (`str`, defaults to `None`):
            Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3.
        offload_param_device (`str`, defaults to `None`):
            Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3.
        offload_optimizer_nvme_path (`str`, defaults to `None`):
            Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.
        offload_param_nvme_path (`str`, defaults to `None`):
            Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.
        zero3_init_flag (`bool`, defaults to `None`):
            Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models. Only applicable
            with ZeRO Stage-3.
        zero3_save_16bit_model (`bool`, defaults to `None`):
            Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3.
        transformer_moe_cls_names (`str`, defaults to `None`):
            Comma-separated list of Transformers MoE layer class names (case-sensitive). For example,
            `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention`, `JetMoEBlock`, etc.
        enable_msamp (`bool`, defaults to `None`):
            Flag to indicate whether to enable MS-AMP backend for FP8 training.
        msamp_opt_level (`Optional[Literal["O1", "O2"]]`, defaults to `None`):
            Optimization level for MS-AMP (defaults to 'O1'). Only applicable if `enable_msamp` is True. Should be one
            of ['O1' or 'O2'].
    """

    hf_ds_config: Any = field(
        default=None,
        metadata={
            "help": "path to DeepSpeed config file or dict or an object of class `accelerate.utils.deepspeed.HfDeepSpeedConfig`."
        },
    )
    gradient_accumulation_steps: int = field(
        default=None,
        metadata={
            "help": "Number of steps to accumulate gradients before updating optimizer states. If not set, will use the value from the `Accelerator` directly."
        },
    )
    gradient_clipping: float = field(default=None, metadata={"help": "Enable gradient clipping with value"})
    zero_stage: int = field(
        default=None,
        metadata={"help": "Possible options are 0,1,2,3; Default will be taken from environment variable"},
    )
    is_train_batch_min: bool = field(
        default=True,
        metadata={"help": "If both train & eval dataloaders are specified, this will decide the train_batch_size"},
    )
    offload_optimizer_device: str = field(
        default=None,
        metadata={"help": "Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3."},
    )
    offload_param_device: str = field(
        default=None,
        metadata={"help": "Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3."},
    )
    offload_optimizer_nvme_path: str = field(
        default=None,
        metadata={"help": "Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3."},
    )
    offload_param_nvme_path: str = field(
        default=None,
        metadata={"help": "Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3."},
    )
    zero3_init_flag: bool = field(
        default=None,
        metadata={
            "help": "Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models."
            "Only applicable with ZeRO Stage-3."
        },
    )
    zero3_save_16bit_model: bool = field(
        default=None,
        metadata={"help": "Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3."},
    )
    transformer_moe_cls_names: str = field(
        default=None,
        metadata={
            "help": "comma-separated list of transformers MoE layer class names (case-sensitive), e.g : "
            " `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ..."
        },
    )
    enable_msamp: bool = field(
        default=None,
        metadata={"help": "Flag to indicate whether to enable MS-AMP backend for FP8 training."},
    )
    msamp_opt_level: Optional[Literal["O1", "O2"]] = field(
        default=None,
        metadata={
            "help": "Optimization level for MS-AMP (defaults to 'O1'). Only applicable if `enable_msamp` is True. Should be one of ['O1' or 'O2']."
        },
    )

    def __post_init__(self):
        """
        Resolve unset fields from environment variables, then build or normalize the DeepSpeed config.

        Two paths exist, depending on `hf_ds_config`:

        - A dict, a string other than `"none"`, or an `HfDeepSpeedConfig` is treated as a user-supplied config. It
          must contain a `zero_optimization` section. `"auto"` entries for the mapped keys are filled from the plugin
          fields, and concrete (non-`None`, non-`"auto"`) config values are then copied back onto the plugin fields.
        - Otherwise a config dict is assembled from the plugin fields.

        Afterwards `deepspeed_config` points at the resulting config dict, `steps_per_print` is set to infinity,
        `zero3_init_flag` is resolved, the plugin is marked as not selected, and an `msamp` section is added when
        MS-AMP is enabled and the config does not already contain one.

        Raises:
            ValueError: If a user-supplied config has no `zero_optimization` section, if
                `_deepspeed_config_checks()` or `fill_match()` reject the configuration, or if `msamp_opt_level` is
                not `"O1"` or `"O2"` while MS-AMP is being enabled.
            NotImplementedError: If MS-AMP is being enabled while `zero_stage` is 3.
        """
        from .deepspeed import HfDeepSpeedConfig

        # Every field still `None` falls back to its environment variable, then to the default given below.
        if self.gradient_accumulation_steps is None:
            gas = os.environ.get("ACCELERATE_GRADIENT_ACCUMULATION_STEPS", "auto")
            # Purely numeric strings become ints; anything else (e.g. "auto") is kept as a string.
            self.gradient_accumulation_steps = int(gas) if gas.isdigit() else gas

        if self.gradient_clipping is None:
            gradient_clipping = os.environ.get("ACCELERATE_GRADIENT_CLIPPING", "auto")
            self.gradient_clipping = gradient_clipping if gradient_clipping == "auto" else float(gradient_clipping)

        if self.zero_stage is None:
            self.zero_stage = int(os.environ.get("ACCELERATE_DEEPSPEED_ZERO_STAGE", 2))

        if self.offload_optimizer_device is None:
            self.offload_optimizer_device = os.environ.get("ACCELERATE_DEEPSPEED_OFFLOAD_OPTIMIZER_DEVICE", "none")

        if self.offload_param_device is None:
            self.offload_param_device = os.environ.get("ACCELERATE_DEEPSPEED_OFFLOAD_PARAM_DEVICE", "none")

        if self.offload_optimizer_nvme_path is None:
            self.offload_optimizer_nvme_path = os.environ.get(
                "ACCELERATE_DEEPSPEED_OFFLOAD_OPTIMIZER_NVME_PATH", "none"
            )

        if self.offload_param_nvme_path is None:
            self.offload_param_nvme_path = os.environ.get("ACCELERATE_DEEPSPEED_OFFLOAD_PARAM_NVME_PATH", "none")

        if self.zero3_save_16bit_model is None:
            self.zero3_save_16bit_model = (
                os.environ.get("ACCELERATE_DEEPSPEED_ZERO3_SAVE_16BIT_MODEL", "false").lower() == "true"
            )
        if self.enable_msamp is None:
            self.enable_msamp = os.environ.get("ACCELERATE_FP8_BACKEND", None) == "MSAMP"

        if self.msamp_opt_level is None:
            self.msamp_opt_level = os.environ.get("ACCELERATE_FP8_OPT_LEVEL", "O1")

        if self.hf_ds_config is None:
            self.hf_ds_config = os.environ.get("ACCELERATE_DEEPSPEED_CONFIG_FILE", "none")

        # Path 1: the user supplied a DeepSpeed config (dict, path-like string, or already-wrapped object).
        if (
            isinstance(self.hf_ds_config, dict)
            or (isinstance(self.hf_ds_config, str) and self.hf_ds_config != "none")
            or isinstance(self.hf_ds_config, HfDeepSpeedConfig)
        ):
            if not isinstance(self.hf_ds_config, HfDeepSpeedConfig):
                self.hf_ds_config = HfDeepSpeedConfig(self.hf_ds_config)
            if "gradient_accumulation_steps" not in self.hf_ds_config.config:
                self.hf_ds_config.config["gradient_accumulation_steps"] = 1
            if "zero_optimization" not in self.hf_ds_config.config:
                raise ValueError("Please specify the ZeRO optimization config in the DeepSpeed config.")

            self._deepspeed_config_checks()
            # Plugin attribute name -> dotted key inside the DeepSpeed config.
            plugin_to_config_mapping = {
                "gradient_accumulation_steps": "gradient_accumulation_steps",
                "gradient_clipping": "gradient_clipping",
                "zero_stage": "zero_optimization.stage",
                "offload_optimizer_device": "zero_optimization.offload_optimizer.device",
                "offload_param_device": "zero_optimization.offload_param.device",
                "offload_param_nvme_path": "zero_optimization.offload_param.nvme_path",
                "offload_optimizer_nvme_path": "zero_optimization.offload_optimizer.nvme_path",
                "zero3_save_16bit_model": "zero_optimization.stage3_gather_16bit_weights_on_model_save",
            }
            kwargs = {v: getattr(self, k) for k, v in plugin_to_config_mapping.items() if getattr(self, k) is not None}
            # `must_match=False`: only "auto" placeholders are filled here; differing concrete values are not reported.
            for key in kwargs.keys():
                self.fill_match(key, **kwargs, must_match=False)
            self.hf_ds_config.set_stage_and_offload()

            # filling the missing values in the class attributes from the DeepSpeed config
            # when using the DeepSpeed config file.
            for key, value in plugin_to_config_mapping.items():
                config_value = self.hf_ds_config.get_value(value)
                if config_value is not None and config_value != "auto":
                    setattr(self, key, config_value)
        else:
            # Path 2: no config supplied, so assemble one from the plugin fields.
            config = {
                "train_batch_size": "auto",
                "train_micro_batch_size_per_gpu": "auto",
                "gradient_accumulation_steps": self.gradient_accumulation_steps,
                "zero_optimization": {
                    "stage": self.zero_stage,
                    "offload_optimizer": {
                        "device": self.offload_optimizer_device,
                        # The NVMe path is only recorded when the matching device is "nvme".
                        "nvme_path": (
                            self.offload_optimizer_nvme_path if self.offload_optimizer_device == "nvme" else None
                        ),
                    },
                    "offload_param": {
                        "device": self.offload_param_device,
                        "nvme_path": (self.offload_param_nvme_path if self.offload_param_device == "nvme" else None),
                    },
                    "stage3_gather_16bit_weights_on_model_save": self.zero3_save_16bit_model,
                },
            }
            # Any truthy value is written, which includes the string "auto".
            if self.gradient_clipping:
                config["gradient_clipping"] = self.gradient_clipping
            self.hf_ds_config = HfDeepSpeedConfig(config)

        # `deepspeed_config` is the same dict object as `hf_ds_config.config`, not a copy.
        self.deepspeed_config = self.hf_ds_config.config
        self.deepspeed_config["steps_per_print"] = float("inf")  # this will stop deepspeed from logging @ stdout
        if self.zero3_init_flag is None:
            # Defaults to whether the config is ZeRO stage 3, unless the environment variable overrides it.
            self.zero3_init_flag = (
                str_to_bool(
                    os.environ.get(
                        "ACCELERATE_DEEPSPEED_ZERO3_INIT",
                        str(self.hf_ds_config.is_zero3()),
                    )
                )
                == 1
            )
        if self.zero3_init_flag and not self.hf_ds_config.is_zero3():
            warnings.warn("DeepSpeed Zero3 Init flag is only applicable for ZeRO Stage 3. Setting it to False.")
            self.zero3_init_flag = False
        # NOTE: Set to False by default, will be set to `True` automatically if it's the first plugin passed
        # to the `Accelerator`'s `deepspeed_plugin` param, *or* `AcceleratorState().enable_deepspeed_plugin(plugin_key)` is manually called
        self._set_selected(False)

        # Ignore if it's already set
        if self.enable_msamp and "msamp" not in self.deepspeed_config:
            if self.zero_stage == 3:
                raise NotImplementedError(
                    "MS-AMP is not supported for ZeRO Stage 3. Please use ZeRO Stage 0, 1, or 2 instead."
                )
            if self.msamp_opt_level not in ["O1", "O2"]:
                raise ValueError("Invalid optimization level for MS-AMP. Please use one of ['O1' or'O2'].")
            self.deepspeed_config["msamp"] = {
                "enabled": True,
                "opt_level": self.msamp_opt_level,
            }

    def fill_match(self, ds_key_long, mismatches=None, must_match=True, **kwargs):
        mismatches = [] if mismatches is None else mismatches
        config, ds_key = self.hf_ds_config.find_config_node(ds_key_long)
        if config is None:
            return

        if config.get(ds_key) == "auto":
            if ds_key_long in kwargs:
                config[ds_key] = kwargs[ds_key_long]
                return
            else:
                raise ValueError(
                    f"`{ds_key_long}` not found in kwargs. "
                    f"Please specify `{ds_key_long}` without `auto` (set to correct value) in the DeepSpeed config file or "
                    "pass it in kwargs."
                )

        if not must_match:
            return

        ds_val = config.get(ds_key)
        if ds_val is not None and ds_key_long in kwargs:
            if ds_val != kwargs[ds_key_long]:
                mismatches.append(f"- ds {ds_key_long}={ds_val} vs arg {ds_key_long}={kwargs[ds_key_long]}")

    def is_auto(self, ds_key_long):
        val = self.hf_ds_config.get_value(ds_key_long)
        if val is None:
            return False
        else:
            return val == "auto"

    def get_value(self, ds_key_long, default=None):
        """
        Look up `ds_key_long` in the wrapped DeepSpeed config.

        Pure delegation to `self.hf_ds_config.get_value(ds_key_long, default)`; how `default` is applied is decided
        by that object.
        """
        return self.hf_ds_config.get_value(ds_key_long, default)

    def deepspeed_config_process(self, prefix="", mismatches=None, config=None, must_match=True, **kwargs):
        """Process the DeepSpeed config with the values from the kwargs."""
        mismatches = [] if mismatches is None else mismatches
        if config is None:
            config = self.deepspeed_config
        for key, value in config.items():
            if isinstance(value, dict):
                self.deepspeed_config_process(
                    prefix=prefix + key + ".",
                    mismatches=mismatches,
                    config=value,
                    must_match=must_match,
                    **kwargs,
                )
            else:
                self.fill_match(prefix + key, mismatches, must_match=must_match, **kwargs)
        if len(mismatches) > 0 and prefix == "":
            mismatches_msg = "\n".join(mismatches)
            raise ValueError(
                "Please correct the following DeepSpeed config values that mismatch kwargs "
                f" values:\n{mismatches_msg}\nThe easiest method is to set these DeepSpeed config values to 'auto'."
            )

    def set_mixed_precision(self, mixed_precision):
        ds_config = self.deepspeed_config
        kwargs = {
            "fp16.enabled": mixed_precision == "fp16",
            # When training in fp8, we still rely on bf16 autocast for the core mixed precision
            "bf16.enabled": mixed_precision in ("bf16", "fp8"),
        }
        if mixed_precision == "fp16":
            if "fp16" not in ds_config:
                ds_config["fp16"] = {"enabled": True, "auto_cast": True}
        elif mixed_precision in ("bf16", "fp8"):
            if "bf16" not in ds_config:
                ds_config["bf16"] = {"enabled": True}

        if mixed_precision == "fp8" and self.enable_msamp:
            if "msamp" not in ds_config:
                ds_config["msamp"] = {
                    "enabled": True,
                    "opt_level": self.msamp_opt_level,
                }

        if mixed_precision != "no":
            diff_dtype = "bf16" if mixed_precision == "fp16" else "fp16"
            if str(ds_config.get(diff_dtype, {}).get("enabled", "False")).lower() == "true":
                raise ValueError(
                    f"`--mixed_precision` arg cannot be set to `{mixed_precision}` when `{diff_dtype}` is set in the DeepSpeed config file."
                )
        for dtype in ["fp16", "bf16"]:
            if dtype not in ds_config:
                ds_config[dtype] = {"enabled": False}
        self.fill_match("fp16.enabled", must_match=False, **kwargs)
        self.fill_match("bf16.enabled", must_match=False, **kwargs)

    def set_deepspeed_weakref(self):
        """
        Register a transformers `HfDeepSpeedConfig` built from a sanitized copy of the current DeepSpeed config and
        keep it referenced on `self.dschf`.

        In the copy, `gradient_accumulation_steps` and `train_micro_batch_size_per_gpu` become `1` when they are
        missing or `"auto"`, and `train_batch_size` is removed when it is `"auto"`. `self.deepspeed_config` itself
        is left unchanged because the work happens on a deep copy.

        Raises:
            Exception: If `zero3_init_flag` is set while Transformers is not installed.
        """
        from .imports import is_transformers_available

        sanitized = copy.deepcopy(self.deepspeed_config)
        if self.zero3_init_flag and not is_transformers_available():
            raise Exception(
                "When `zero3_init_flag` is set, it requires Transformers to be installed. "
                "Please run `pip install transformers`."
            )

        # Unresolved size entries are replaced by 1 in the copy.
        for size_key in ("gradient_accumulation_steps", "train_micro_batch_size_per_gpu"):
            if sanitized.get(size_key, "auto") == "auto":
                sanitized[size_key] = 1
        if sanitized.get("train_batch_size", None) == "auto":
            sanitized.pop("train_batch_size")

        # The import location depends on the installed transformers version.
        if compare_versions("transformers", "<", "4.46"):
            from transformers.deepspeed import (
                HfDeepSpeedConfig,
                unset_hf_deepspeed_config,
            )
        else:
            from transformers.integrations import (
                HfDeepSpeedConfig,
                unset_hf_deepspeed_config,
            )

        unset_hf_deepspeed_config()
        self.dschf = HfDeepSpeedConfig(sanitized)  # keep this object alive # noqa

    def is_zero3_init_enabled(self):
        """Return the current value of `zero3_init_flag`."""
        return self.zero3_init_flag

    @contextmanager
    def zero3_init_context_manager(self, enable=False):
        old = self.zero3_init_flag
        if old == enable:
            yield
        else:
            self.zero3_init_flag = enable
            self.dschf = None
            self.set_deepspeed_weakref()
            yield
            self.zero3_init_flag = old
            self.dschf = None
            self.set_deepspeed_weakref()

    def _deepspeed_config_checks(self):
        env_variable_names_to_ignore = [
            "ACCELERATE_GRADIENT_ACCUMULATION_STEPS",
            "ACCELERATE_GRADIENT_CLIPPING",
            "ACCELERATE_DEEPSPEED_ZERO_STAGE",
            "ACCELERATE_DEEPSPEED_OFFLOAD_OPTIMIZER_DEVICE",
            "ACCELERATE_DEEPSPEED_OFFLOAD_PARAM_DEVICE",
            "ACCELERATE_DEEPSPEED_OFFLOAD_PARAM_NVME_PATH",
            "ACCELERATE_DEEPSPEED_OFFLOAD_OPTIMIZER_NVME_PATH",
            "ACCELERATE_DEEPSPEED_ZERO3_SAVE_16BIT_MODEL",
            "ACCELERATE_MIXED_PRECISION",
        ]
        env_variable_names_to_ignore = [
            name.replace("ACCELERATE_", "").replace("DEEPSPEED_", "").lower() for name in env_variable_names_to_ignore
        ]

        deepspeed_fields_from_accelerate_config = os.environ.get("ACCELERATE_CONFIG_DS_FIELDS", "").split(",")

        if any(name in env_variable_names_to_ignore for name in deepspeed_fields_from_accelerate_config):
            raise ValueError(
                f"When using `deepspeed_config_file`, the following accelerate config variables will be ignored: {env_variable_names_to_ignore}.\n"
                "Please specify them appropriately in the DeepSpeed config file.\n"
                "If you are using an accelerate config file, remove others config variables mentioned in the above specified list.\n"
                "The easiest method is to create a new config following the questionnaire via `accelerate config`.\n"
                "It will only ask for the necessary config variables when using `deepspeed_config_file`."
            )

    def set_moe_leaf_modules(self, model):
        """
        Mark the configured MoE layer classes of `model` as ZeRO-3 leaf modules.

        The class names come from `transformer_moe_cls_names`, or from the
        `ACCELERATE_DEEPSPEED_MOE_LAYER_CLS_NAMES` environment variable when that field is `None`. Nothing happens if
        neither is set. Whitespace around each comma-separated name is ignored, so both `"A,B"` and `"A, B"` work.

        Args:
            model:
                The model whose module classes are looked up by name and passed to
                `deepspeed.utils.set_z3_leaf_modules`.

        Raises:
            ImportError: If the installed DeepSpeed is older than 0.14.0.
            Exception: If one of the names does not match a module class in `model`.
        """
        if self.transformer_moe_cls_names is None:
            self.transformer_moe_cls_names = os.environ.get("ACCELERATE_DEEPSPEED_MOE_LAYER_CLS_NAMES", None)
        if self.transformer_moe_cls_names is None:
            return

        if compare_versions("deepspeed", "<", "0.14.0"):
            raise ImportError("DeepSpeed version must be >= 0.14.0 to use MOE support. Please update DeepSpeed.")
        from deepspeed.utils import set_z3_leaf_modules

        transformer_moe_cls = []
        for raw_name in self.transformer_moe_cls_names.split(","):
            # Users commonly write "A, B"; a class name never contains surrounding whitespace.
            layer_class = raw_name.strip()
            transformer_cls = get_module_class_from_name(model, layer_class)
            if transformer_cls is None:
                raise Exception(
                    f"Could not find a transformer layer class called '{layer_class}' to wrap in the model."
                )
            transformer_moe_cls.append(transformer_cls)
        set_z3_leaf_modules(model, transformer_moe_cls)  # z3_leaf

    def select(self, _from_accelerator_state: bool = False):
        """
        Sets the HfDeepSpeedWeakref to use the current deepspeed plugin configuration
        """
        if not _from_accelerator_state:
            raise ValueError(
                "A `DeepSpeedPlugin` object must be enabled manually by calling `AcceleratorState().enable_deepspeed_plugin(plugin_key)`."
            )
        self.set_deepspeed_weakref()
        self._set_selected(True)

    def _unselect(self):
        """Mark this plugin as not selected."""
        self._set_selected(False)

    def _set_selected(self, value: bool):
        """
        Private setter for the `_selected` attribute, which backs the read-only `selected` property.
        """
        self._selected = value

    @property
    def selected(self):
        """Whether this plugin is currently marked as selected (the value stored in `_selected`)."""
        return self._selected

    @selected.setter
    def selected(self, value):
        # Direct assignment is always rejected; the flag is only changed through `_set_selected`.
        # NOTE(review): the message says 'enabled' although the property is named `selected` — confirm the wording.
        raise NotImplementedError(
            "'enabled' can only be set through calling 'AcceleratorState().enable_deepspeed_plugin(key)'."
        )


@dataclass
class FullyShardedDataParallelPlugin:
    """
    This plugin is used to enable fully sharded data parallelism.

    Args:
        fsdp_version (`int`, defaults to `1`):
            The version of FSDP to use. Defaults to 1. If set to 2, launcher expects the config to be converted to
            FSDP2 format.
        sharding_strategy (`Union[str, torch.distributed.fsdp.ShardingStrategy]`, defaults to `'FULL_SHARD'`):
            Sharding strategy to use. Should be either a `str` or an instance of
            `torch.distributed.fsdp.fully_sharded_data_parallel.ShardingStrategy`. Is deprecated in favor of
            `reshard_after_forward`.
        reshard_after_forward (`Union[str, torch.distributed.fsdp.ShardingStrategy, bool]`, defaults to `'FULL_SHARD'` for `fsdp_version=1` and `True` for `fsdp_version=2`):
            Sharding strategy to use. Should be a bool if `fsdp_version` is set to 2 else a `str` or an instance of
            `torch.distributed.fsdp.fully_sharded_data_parallel.ShardingStrategy`.
        backward_prefetch (`Union[str, torch.distributed.fsdp.BackwardPrefetch]`, defaults to `'NO_PREFETCH'`):
            Backward prefetch strategy to use. Should be either a `str` or an instance of
            `torch.distributed.fsdp.fully_sharded_data_parallel.BackwardPrefetch`.
        mixed_precision_policy (`Optional[Union[dict, str, torch.distributed.fsdp.MixedPrecision, torch.distributed.fsdp.MixedPrecisionPolicy]]`, defaults to `None`):
            A config to enable mixed precision training with FullyShardedDataParallel. If passing in a `dict`, it
            should have the following keys: `param_dtype`, `reduce_dtype`, and `buffer_dtype`, can be an instance of
            `torch.distributed.fsdp.MixedPrecisionPolicy` if `fsdp_version` is set to 2. If passing in a `str`, it
            should be one of the following values: fp8, fp16, bf16, fp32, and used to set `param_dtype`,
            `reduce_dtype`, and `buffer_dtype`.
        auto_wrap_policy (`Optional[Union[Callable, Literal["transformer_based_wrap", "size_based_wrap", "no_wrap"]]]`, defaults to `NO_WRAP`):
            A callable or string specifying a policy to recursively wrap layers with FSDP. If a string, it must be one
            of `transformer_based_wrap`, `size_based_wrap`, or `no_wrap`. See
            `torch.distributed.fsdp.wrap.size_based_wrap_policy` for a direction on what it should look like.
        cpu_offload (`Union[bool, torch.distributed.fsdp.CPUOffload, torch.distributed.fsdp.CPUOffloadPolicy]`, defaults to `False`):
            Whether to offload parameters to CPU. Should be either a `bool` or an instance of
            `torch.distributed.fsdp.fully_sharded_data_parallel.CPUOffload` or
            `torch.distributed.fsdp.fully_sharded_data_parallel.CPUOffloadPolicy` if `fsdp_version` is set to 2.
        ignored_modules (`Optional[Union[Iterable[torch.nn.Module], str]]`, defaults to `None`):
            A list of modules to ignore when wrapping with FSDP. When passing a string, will match the modules by name
            using regex fullmatch. If `fsdp_version` is set to 2, the modules are converted to parameters and used.
        state_dict_type (`Union[str, torch.distributed.fsdp.StateDictType]`, defaults to `'FULL_STATE_DICT'`):
            State dict type to use. If a string, it must be one of `full_state_dict`, `local_state_dict`, or
            `sharded_state_dict`.
        state_dict_config (`Optional[Union[torch.distributed.fsdp.FullStateDictConfig, torch.distributed.fsdp.ShardedStateDictConfig]`, defaults to `None`):
            State dict config to use. Is determined based on the `state_dict_type` if not passed in.
        optim_state_dict_config (`Optional[Union[torch.distributed.fsdp.FullOptimStateDictConfig, torch.distributed.fsdp.ShardedOptimStateDictConfig]`, defaults to `None`):
            Optim state dict config to use. Is determined based on the `state_dict_type` if not passed in.
        limit_all_gathers (`bool`, defaults to `True`):
            Whether to have FSDP explicitly synchronizes the CPU thread to prevent too many in-flight all-gathers. This
            bool only affects the sharded strategies that schedule all-gathers. Enabling this can help lower the number
            of CUDA malloc retries.
        use_orig_params (`bool`, defaults to `False`):
            Whether to use the original parameters for the optimizer.
        param_init_fn (`Optional[Callable[[torch.nn.Module], None]`, defaults to `None`):
            A `Callable[torch.nn.Module] -> None` that specifies how modules that are currently on the meta device
            should be initialized onto an actual device. Only applicable when `sync_module_states` is `True`. By
            default is a `lambda` which calls `to_empty` on the module.
        sync_module_states (`bool`, defaults to `False`):
            Whether each individually wrapped FSDP unit should broadcast module parameters from rank 0 to ensure they
            are the same across all ranks after initialization. Defaults to `False` unless `cpu_ram_efficient_loading`
            is `True`, then will be forcibly enabled.
        forward_prefetch (`bool`, defaults to `False`):
            Whether to have FSDP explicitly prefetches the next upcoming all-gather while executing in the forward
            pass. only use with Static graphs.
        activation_checkpointing (`bool`, defaults to `False`):
            A technique to reduce memory usage by clearing activations of certain layers and recomputing them during a
            backward pass. Effectively, this trades extra computation time for reduced memory usage.
        cpu_ram_efficient_loading (`bool`, defaults to `None`):
            If True, only the first process loads the pretrained model checkpoint while all other processes have empty
            weights. Only applicable for Transformers. When using this, `sync_module_states` needs to be `True`.
        transformer_cls_names_to_wrap (`Optional[List[str]]`, defaults to `None`):
            A list of transformer layer class names to wrap. Only applicable when `auto_wrap_policy` is
            `transformer_based_wrap`.
        min_num_params (`Optional[int]`, defaults to `None`):
            The minimum number of parameters a module must have to be wrapped. Only applicable when `auto_wrap_policy`
            is `size_based_wrap`.
    """

    fsdp_version: int = field(
        default=None,
        metadata={
            "help": "The version of FSDP to use. Defaults to 1. If set to 2, launcher expects the config to be converted to FSDP2 format."
        },
    )

    sharding_strategy: Union[str, "torch.distributed.fsdp.ShardingStrategy"] = field(
        default=None,
        metadata={
            "help": "Sharding strategy to use. Should be either a `str` or an instance of `torch.distributed.fsdp.fully_sharded_data_parallel.ShardingStrategy`. Defaults to 'FULL_SHARD'. Is deprecated in favor of `reshard_after_forward` "
        },
    )

    reshard_after_forward: Union[str, "torch.distributed.fsdp.ShardingStrategy", bool] = field(
        default=None,
        metadata={
            "help": "Sharding strategy to use. Should be a bool if `fsdp_version` is set to 2 else a `str` or an instance of `torch.distributed.fsdp.fully_sharded_data_parallel.ShardingStrategy`. Defaults to 'FULL_SHARD'"
        },
    )
    backward_prefetch: Optional[Union[str, "torch.distributed.fsdp.BackwardPrefetch"]] = field(
        default=None,
        metadata={
            "help": "Backward prefetch strategy to use. Should be either a `str` or an instance of `torch.distributed.fsdp.fully_sharded_data_parallel.BackwardPrefetch`. Defaults to 'NO_PREFETCH'. This becomes obsolete in FSDP2."
        },
    )
    mixed_precision_policy: Optional[
        Union[
            dict,
            str,
            "torch.distributed.fsdp.MixedPrecision",
            "torch.distributed.fsdp.MixedPrecisionPolicy",
        ]
    ] = field(
        default=None,
        metadata={
            "help": "A config to enable mixed precision training with FullyShardedDataParallel. "
            "If passing in a `dict`, it should have the following keys: `param_dtype`, `reduce_dtype`, and `buffer_dtype`."
            "Can also be an instance of `torch.distributed.fsdp.MixedPrecisionPolicy` if `fsdp_version` is set to 2."
        },
    )
    auto_wrap_policy: Optional[Union[Callable, Literal["transformer_based_wrap", "size_based_wrap", "no_wrap"]]] = (
        field(
            default=None,
            metadata={
                "help": "A callable or string specifying a policy to recursively wrap layers with FSDP. If a string, it must be one of `transformer_based_wrap`, `size_based_wrap`, or `no_wrap`. "
                "Defaults to `NO_WRAP`. See `torch.distributed.fsdp.wrap.size_based_wrap_policy` for a direction on what it should look like"
            },
        )
    )
    cpu_offload: Union[
        bool,
        "torch.distributed.fsdp.CPUOffload",
        "torch.distributed.fsdp.CPUOffloadPolicy",
    ] = field(
        default=None,
        metadata={
            "help": "Whether to offload parameters to CPU. Should be either a `bool` or an instance of `torch.distributed.fsdp.fully_sharded_data_parallel.CPUOffload` or `torch.distributed.fsdp.fully_sharded_data_parallel.CPUOffloadPolicy` if `fsdp_version` is set to 2. Defaults to `False`"
        },
    )
    ignored_modules: Optional[Union[Iterable[torch.nn.Module], str]] = field(
        default=None,
        metadata={"help": "A list of modules to ignore when wrapping with FSDP."},
    )

    state_dict_type: Union[str, "torch.distributed.fsdp.StateDictType"] = field(
        default=None,
        metadata={
            "help": "State dict type to use. If a string, it must be one of `full_state_dict`, `local_state_dict`, or `sharded_state_dict`. Defaults to `FULL_STATE_DICT`"
        },
    )
    state_dict_config: Optional[
        Union[
            "torch.distributed.fsdp.FullStateDictConfig",
            "torch.distributed.fsdp.ShardedStateDictConfig",
        ]
    ] = field(
        default=None,
        metadata={"help": "State dict config to use. Is determined based on the `state_dict_type` if not passed in."},
    )
    optim_state_dict_config: Optional[
        Union[
            "torch.distributed.fsdp.FullOptimStateDictConfig",
            "torch.distributed.fsdp.ShardedOptimStateDictConfig",
        ]
    ] = field(
        default=None,
        metadata={
            "help": "Optim state dict config to use. Is determined based on the `state_dict_type` if not passed in."
        },
    )
    limit_all_gathers: bool = field(
        default=True,
        metadata={
            "help": "Whether to have FSDP explicitly synchronizes the CPU thread to prevent "
            "too many in-flight all-gathers. This bool only affects the sharded strategies that schedule all-gathers. "
            "Enabling this can help lower the number of CUDA malloc retries."
        },
    )
    use_orig_params: Optional[bool] = field(
        default=None,
        metadata={
            "help": "Whether to use the original parameters for the optimizer. Defaults to `False`. This becomes obsolete in FSDP2."
        },
    )
    param_init_fn: Optional[Callable[[torch.nn.Module], None]] = field(
        default=None,
        metadata={
            "help": "A Callable[torch.nn.Module] -> None that specifies how modules "
            "that are currently on the meta device should be initialized onto an actual device. "
            "Only applicable when `sync_module_states` is `True`. By default is a `lambda` which calls `to_empty` on the module."
        },
    )
    sync_module_states: Optional[bool] = field(
        default=None,
        metadata={
            "help": "Whether each individually wrapped FSDP unit should broadcast module parameters from rank 0 "
            "to ensure they are the same across all ranks after initialization. Defaults to `False` unless "
            "`cpu_ram_efficient_loading` is `True`, then will be forcibly enabled. This becomes obsolete in FSDP2."
        },
    )
    forward_prefetch: bool = field(
        default=None,
        metadata={
            "help": "Whether to have FSDP explicitly prefetches the next upcoming "
            "all-gather while executing in the forward pass. only use with Static graphs. Defaults to `False`"
        },
    )
    activation_checkpointing: bool = field(
        default=None,
        metadata={
            "help": "A technique to reduce memory usage by clearing activations of "
            "certain layers and recomputing them during a backward pass. Effectively, this trades extra computation time "
            "for reduced memory usage. Defaults to `False`"
        },
    )
    cpu_ram_efficient_loading: bool = field(
        default=None,
        metadata={
            "help": "If True, only the first process loads the pretrained model checkoint while all other processes have empty weights. "
            "Only applicable for 🤗 Transformers. When using this, `sync_module_states` needs to be `True`. Defaults to `False`."
        },
    )
    transformer_cls_names_to_wrap: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": "A list of transformer layer class names to wrap. Only applicable when `auto_wrap_policy` is `transformer_based_wrap`."
        },
    )
    min_num_params: Optional[int] = field(
        default=None,
        metadata={
            "help": "The minimum number of parameters a module must have to be wrapped. Only applicable when `auto_wrap_policy` is `size_based_wrap`."
        },
    )

    def __post_init__(self):
        """
        Resolves every field that was left unset and normalizes the ones that were passed in.

        Explicitly passed values always win. A field left as `None` is read from the matching `FSDP_*` environment
        variable (or a built-in default). String values are converted to the corresponding torch enums / policy
        objects, options that are obsolete in FSDP2 are reset to `None`, and all FSDP2-related notices are emitted
        as one single log record at the end.

        Raises:
            ImportError: If `fsdp_version` is 2 and the installed PyTorch is older than `FSDP2_PYTORCH_VERSION`.
            ValueError: If `reshard_after_forward` has the wrong type for the selected FSDP version, if
                `auto_wrap_policy` is an unknown string, if `min_num_params` is not an integer, or if
                `forward_prefetch` is set together with `fsdp_version=2`.
            RuntimeError: If `sync_module_states` is enabled and no supported accelerator device is available.
        """
        from torch.distributed.fsdp import BackwardPrefetch, ShardingStrategy

        # Kept as a list (not a set) so the combined message has a deterministic order.
        _fsdp2_warnings = []

        env_prefix = "FSDP_"
        # Strategy: By default we should always assume that values are passed in, else we check the environment variables
        if self.fsdp_version is None:
            self.fsdp_version = int(os.environ.get(env_prefix + "VERSION", "1"))

        if self.fsdp_version == 2:
            if not is_torch_version(">=", FSDP2_PYTORCH_VERSION):
                raise ImportError(f"FSDP2 requires PyTorch >= {FSDP2_PYTORCH_VERSION}")

        if self.sharding_strategy is not None:
            # We cannot properly detect all of the cases, as by default `args.fsdp_sharding_strategy` is set to `fully_shard`
            # Therefore we issue a warning only if the user has explicitly set it inside their plugin
            _fsdp2_warnings.append(
                "sharding_strategy is deprecated in favor of reshard_after_forward. "
                "This will be removed in a future version of Accelerate."
            )
        if self.fsdp_version == 1:
            if self.sharding_strategy is None:
                self.sharding_strategy = os.environ.get(env_prefix + "SHARDING_STRATEGY", "FULL_SHARD")
            if isinstance(self.sharding_strategy, str):
                # Names listed in `FSDP_SHARDING_STRATEGY` are mapped to their 1-based position first, the result
                # (or a digit string) is then turned into the enum by value; anything else is looked up by name.
                if self.sharding_strategy.upper() in FSDP_SHARDING_STRATEGY:
                    self.sharding_strategy = FSDP_SHARDING_STRATEGY.index(self.sharding_strategy.upper()) + 1
                if isinstance(self.sharding_strategy, int) or self.sharding_strategy.isdigit():
                    self.sharding_strategy = ShardingStrategy(int(self.sharding_strategy))
                else:
                    self.sharding_strategy = ShardingStrategy[self.sharding_strategy.upper()]

        # Fallback to `reshard_after_forward` in FSDP1 if `sharding_strategy` is not set
        if self.reshard_after_forward is None and self.sharding_strategy is None:
            reshard_after_forward = os.environ.get(
                env_prefix + "RESHARD_AFTER_FORWARD",
                "true" if self.fsdp_version == 2 else "FULL_SHARD",
            )
            if self.fsdp_version == 2:
                self.reshard_after_forward = str_to_bool(reshard_after_forward.lower(), to_bool=True)
            else:
                self.reshard_after_forward = reshard_after_forward
        if isinstance(self.reshard_after_forward, str):
            if self.fsdp_version == 2:
                self.reshard_after_forward = str_to_bool(self.reshard_after_forward.lower(), to_bool=True)
            else:
                # We need to remap based on custom enum values for user readability
                if self.reshard_after_forward.upper() in FSDP_SHARDING_STRATEGY:
                    self.reshard_after_forward = FSDP_SHARDING_STRATEGY.index(self.reshard_after_forward.upper()) + 1
                if isinstance(self.reshard_after_forward, int) or self.reshard_after_forward.isdigit():
                    self.reshard_after_forward = ShardingStrategy(int(self.reshard_after_forward))
                else:
                    self.reshard_after_forward = ShardingStrategy[self.reshard_after_forward.upper()]

        if self.fsdp_version == 2 and not isinstance(self.reshard_after_forward, bool):
            raise ValueError(
                f"reshard_after_forward set to {self.reshard_after_forward}. This is not supported with FSDP2, please set to a `bool`"
            )
        if self.fsdp_version == 1 and isinstance(self.reshard_after_forward, bool):
            raise ValueError(
                f"reshard_after_forward set to {self.reshard_after_forward}. This is not supported with FSDP1, please set to a `str` or an instance of `torch.distributed.fsdp.fully_sharded_data_parallel.ShardingStrategy`"
            )

        if self.cpu_offload is None:
            self.cpu_offload = str_to_bool(os.environ.get(env_prefix + "OFFLOAD_PARAMS", "False")) == 1

        self.set_cpu_offload()  # abstracted away to hide imports due to version checks
        self.validate_cpu_offload()

        if self.backward_prefetch is None:
            self.backward_prefetch = os.environ.get(env_prefix + "BACKWARD_PREFETCH", None)
        # "NO_PREFETCH" is represented as `None` from here on.
        if isinstance(self.backward_prefetch, str) and self.backward_prefetch.upper() == "NO_PREFETCH":
            self.backward_prefetch = None
        if self.backward_prefetch is not None and not isinstance(self.backward_prefetch, BackwardPrefetch):
            if isinstance(self.backward_prefetch, str) and self.backward_prefetch.upper() in FSDP_BACKWARD_PREFETCH:
                self.backward_prefetch = FSDP_BACKWARD_PREFETCH.index(self.backward_prefetch.upper()) + 1
            if isinstance(self.backward_prefetch, int) or self.backward_prefetch.isdigit():
                self.backward_prefetch = BackwardPrefetch(int(self.backward_prefetch))
            else:
                self.backward_prefetch = BackwardPrefetch[self.backward_prefetch.upper()]
        if self.fsdp_version == 2 and self.backward_prefetch is not None:
            _fsdp2_warnings.append("backward_prefetch is not supported in FSDP2. Setting backward prefetch to None.")
            self.backward_prefetch = None

        self.set_state_dict_type()

        if self.auto_wrap_policy is None:
            self.auto_wrap_policy = os.environ.get(env_prefix + "AUTO_WRAP_POLICY", "NO_WRAP")
        if isinstance(self.auto_wrap_policy, str):
            if self.auto_wrap_policy.upper() not in FSDP_AUTO_WRAP_POLICY:
                raise ValueError(
                    f"Invalid auto wrap policy: {self.auto_wrap_policy}. Must be one of {FSDP_AUTO_WRAP_POLICY}"
                )
            from torch.distributed.fsdp.wrap import (
                size_based_auto_wrap_policy,
                transformer_auto_wrap_policy,
            )

            # The string is replaced by the bare torch policy callable here; `set_auto_wrap_policy` later binds
            # the model-specific arguments to it.
            if self.auto_wrap_policy.upper() == "TRANSFORMER_BASED_WRAP":
                self.auto_wrap_policy = transformer_auto_wrap_policy
                if self.transformer_cls_names_to_wrap is None:
                    self.transformer_cls_names_to_wrap = os.environ.get(env_prefix + "TRANSFORMER_CLS_TO_WRAP", None)
                if isinstance(self.transformer_cls_names_to_wrap, str):
                    self.transformer_cls_names_to_wrap = self.transformer_cls_names_to_wrap.split(",")
            elif self.auto_wrap_policy.upper() == "SIZE_BASED_WRAP":
                self.auto_wrap_policy = size_based_auto_wrap_policy
                if self.min_num_params is None:
                    self.min_num_params = int(os.environ.get(env_prefix + "MIN_NUM_PARAMS", 0))
                elif not isinstance(self.min_num_params, int):
                    raise ValueError(
                        f"`min_num_params` must be an integer. Got {self.min_num_params} of type {type(self.min_num_params)}"
                    )
            elif self.auto_wrap_policy.upper() == "NO_WRAP":
                self.auto_wrap_policy = None

        if self.use_orig_params is None and self.fsdp_version == 1:
            self.use_orig_params = str_to_bool(os.environ.get(env_prefix + "USE_ORIG_PARAMS", "False")) == 1
        if self.fsdp_version == 2 and self.use_orig_params is not None:
            _fsdp2_warnings.append(
                "use_orig_params is obsolete in FSDP2, as FSDP2 always uses the original parameters."
            )
            self.use_orig_params = None

        if self.sync_module_states is None and self.fsdp_version == 1:
            self.sync_module_states = str_to_bool(os.environ.get(env_prefix + "SYNC_MODULE_STATES", "False")) == 1
        if self.fsdp_version == 2 and self.sync_module_states is not None:
            _fsdp2_warnings.append(
                "sync_module_states is obsolete in FSDP2, as it is not needed anymore. "
                "Setting sync_module_states to None."
            )
            self.sync_module_states = None

        if self.forward_prefetch is None and self.fsdp_version == 1:
            self.forward_prefetch = str_to_bool(os.environ.get(env_prefix + "FORWARD_PREFETCH", "False")) == 1
        if self.fsdp_version == 2 and self.forward_prefetch is not None:
            raise ValueError("forward_prefetch is not yet implemented in FSDP2, set to None or use `fsdp_version=1`")

        if self.activation_checkpointing is None:
            self.activation_checkpointing = (
                str_to_bool(os.environ.get(env_prefix + "ACTIVATION_CHECKPOINTING", "False")) == 1
            )

        if self.ignored_modules is None:
            self.ignored_modules = os.environ.get(env_prefix + "IGNORED_MODULES", None)

        if self.cpu_ram_efficient_loading is None:
            self.cpu_ram_efficient_loading = (
                str_to_bool(os.environ.get(env_prefix + "CPU_RAM_EFFICIENT_LOADING", "False")) == 1
            )
        else:
            # We still need to set it for transformers
            os.environ[env_prefix + "CPU_RAM_EFFICIENT_LOADING"] = str(self.cpu_ram_efficient_loading)
        # There's no need to specify sync_module_states in FSDP2
        if self.fsdp_version == 1 and self.cpu_ram_efficient_loading and not self.sync_module_states:
            warnings.warn(
                "sync_module_states cannot be False since efficient cpu ram loading enabled. "
                "Setting sync_module_states to True."
            )
            self.sync_module_states = True
        if isinstance(self.mixed_precision_policy, str):
            # override is True since self.mixed_precision_policy is not None
            # has to be overwritten with the correct mixed precision object
            self.set_mixed_precision(self.mixed_precision_policy, override=True)
        elif isinstance(self.mixed_precision_policy, dict):
            self.set_mixed_precision(self.mixed_precision_policy)
        if self.mixed_precision_policy is not None:
            self.validate_mixed_precision_policy()

        if self.sync_module_states:
            if is_npu_available():
                device = torch.npu.current_device()
            elif is_mlu_available():
                device = torch.mlu.current_device()
            elif is_musa_available():
                device = torch.musa.current_device()
            elif is_cuda_available():
                device = torch.cuda.current_device()
            elif is_xpu_available():
                device = torch.xpu.current_device()
            elif is_hpu_available():
                device = torch.hpu.current_device()
            else:
                raise RuntimeError(
                    "There are currently no available devices found, must be one of 'XPU', 'CUDA', 'MLU', 'NPU', 'MUSA', or 'HPU'."
                )
            # Create a function that will be used to initialize the parameters of the model
            # when using `sync_module_states`
            self.param_init_fn = lambda x: x.to_empty(device=device, recurse=False)
        if is_torch_version("<", "2.7.0") and self.fsdp_version == 2 and self.ignored_modules is not None:
            _fsdp2_warnings.append(
                "FSDP2 ignored_params/ignored_modules is not available for torch version < 2.7.0. "
                "Setting ignored_modules to None."
            )
            self.ignored_modules = None
        #  Single warning for all deprecation warnings due to FSDP2 conversion
        if _fsdp2_warnings:
            # The header is printed once, followed by one collected notice per line. (Using the header as the
            # `join` separator would drop it for a single notice and repeat it between several.)
            logger.warning(
                "Multiple deprecation warnings due to FSDP2 conversion:\n" + "\n".join(_fsdp2_warnings)
            )

    def set_state_dict_type(self, state_dict_type=None):
        """
        Set the state dict config based on the `StateDictType`.
        """
        from torch.distributed.fsdp.fully_sharded_data_parallel import (
            FullOptimStateDictConfig,
            FullStateDictConfig,
            ShardedOptimStateDictConfig,
            ShardedStateDictConfig,
            StateDictType,
        )

        # Override the state_dict_type if provided, typical use case:
        # user trains with sharded, but final save is with full
        if state_dict_type is not None:
            self.state_dict_type = state_dict_type

        if self.state_dict_type is None:
            self.state_dict_type = os.environ.get(
                "FSDP_STATE_DICT_TYPE",
                "FULL_STATE_DICT" if self.fsdp_version == 1 else "SHARDED_STATE_DICT",
            )
        if isinstance(self.state_dict_type, str):
            if self.state_dict_type.isdigit():
                self.state_dict_type = StateDictType(int(self.state_dict_type))
            else:
                self.state_dict_type = StateDictType[self.state_dict_type.upper()]

        if self.state_dict_type == StateDictType.FULL_STATE_DICT:
            if self.state_dict_config is None:
                self.state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
            if self.optim_state_dict_config is None:
                self.optim_state_dict_config = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=True)
        elif self.state_dict_type == StateDictType.SHARDED_STATE_DICT:
            if self.state_dict_config is None:
                self.state_dict_config = ShardedStateDictConfig(offload_to_cpu=True)
            if self.optim_state_dict_config is None:
                self.optim_state_dict_config = ShardedOptimStateDictConfig(offload_to_cpu=True)

        if self.fsdp_version == 2 and self.state_dict_type == StateDictType.LOCAL_STATE_DICT:
            raise ValueError(
                "FSDP2 does not support LOCAL_STATE_DICT. "
                "Please set `fsdp_state_dict_type` to `SHARDED_STATE_DICT` or `FULL_STATE_DICT`."
            )

    def set_auto_wrap_policy(self, model):
        """
        Given `model`, creates an `auto_wrap_policy` based on the passed in policy and if we can use the
        `transformer_cls_to_wrap`
        """
        from torch.distributed.fsdp.wrap import (
            size_based_auto_wrap_policy,
            transformer_auto_wrap_policy,
        )

        # First base off of `_no_split_modules`
        no_split_modules = getattr(model, "_no_split_modules", None)
        default_transformer_cls_names_to_wrap = list(no_split_modules) if no_split_modules is not None else []
        if self.auto_wrap_policy == transformer_auto_wrap_policy:
            if self.transformer_cls_names_to_wrap is None:
                self.transformer_cls_names_to_wrap = default_transformer_cls_names_to_wrap
            transformer_cls_to_wrap = set()
            for layer_class in self.transformer_cls_names_to_wrap:
                transformer_cls = get_module_class_from_name(model, layer_class)
                if transformer_cls is None:
                    raise ValueError(f"Could not find the transformer layer class {layer_class} in the model.")
                transformer_cls_to_wrap.add(transformer_cls)
            # Finally we set the auto_wrap_policy to a callable
            self.auto_wrap_policy = functools.partial(
                self.auto_wrap_policy, transformer_layer_cls=transformer_cls_to_wrap
            )

        elif self.auto_wrap_policy == size_based_auto_wrap_policy:
            # If zero, we silently ignore it.
            if self.min_num_params > 0:
                self.auto_wrap_policy = functools.partial(self.auto_wrap_policy, min_num_params=self.min_num_params)
            else:
                self.auto_wrap_policy = None

    def set_mixed_precision(self, mixed_precision, buffer_autocast=False, override=False):
        "Sets the mixed precision policy for FSDP"
        mixed_precision_mapping = {
            "fp8": torch.bfloat16,
            "fp16": torch.float16,
            "bf16": torch.bfloat16,
            "fp32": torch.float32,
        }
        dtype = mixed_precision
        if isinstance(mixed_precision, str):
            dtype = mixed_precision_mapping.get(mixed_precision, None)
            if dtype is None:
                raise ValueError(
                    f"Invalid mixed precision: {mixed_precision}. Must be one of {list(mixed_precision_mapping.keys())}"
                )
        elif isinstance(mixed_precision, torch.dtype) and mixed_precision not in mixed_precision_mapping.values():
            raise ValueError(
                f"Invalid mixed precision: {mixed_precision}. Must be one of {list(mixed_precision_mapping.values())}"
            )

        buffer_type = torch.float32 if buffer_autocast else dtype

        if self.fsdp_version == 1:
            from torch.distributed.fsdp import MixedPrecision
        elif self.fsdp_version == 2:
            from torch.distributed.fsdp import MixedPrecisionPolicy as MixedPrecision

        if override or self.mixed_precision_policy is None:
            dtype_args = {"param_dtype": dtype, "reduce_dtype": dtype}
            if self.fsdp_version == 1:
                dtype_args["buffer_dtype"] = buffer_type
            else:
                dtype_args["output_dtype"] = dtype
            # TODO(s1ro1): `cast_forward_inputs` for FSDP2?
            self.mixed_precision_policy = MixedPrecision(**dtype_args)
        elif isinstance(self.mixed_precision_policy, dict):
            # Check for incompatible types
            valid_keys = ["param_dtype", "reduce_dtype"] + (
                ["buffer_dtype"] if self.fsdp_version == 1 else ["output_dtype"]
            )
            missing_keys = [k for k in valid_keys if k not in self.mixed_precision_policy]
            invalid_values = [
                k for k, v in self.mixed_precision_policy.items() if v not in mixed_precision_mapping.values()
            ]
            if missing_keys or invalid_values:
                raise ValueError(
                    f"Invalid mixed precision policy: {self.mixed_precision_policy}. "
                    f"Must be a `dict` with keys {valid_keys}."
                    f"Values must be one of {list(mixed_precision_mapping.values())}"
                )
            self.mixed_precision_policy = MixedPrecision(**self.mixed_precision_policy)

    def validate_mixed_precision_policy(self):
        """
        Validates the mixed precision policy, abstracted away to not bring in the imports if not needed.
        """
        if self.fsdp_version == 2:
            from torch.distributed.fsdp import MixedPrecisionPolicy as MixedPrecision
        else:
            from torch.distributed.fsdp import MixedPrecision

        if not isinstance(self.mixed_precision_policy, MixedPrecision):
            required_type = (
                "`torch.distributed.fsdp.MixedPrecisionPolicy`"
                if self.fsdp_version == 2
                else "`torch.distributed.fsdp.MixedPrecision`"
            )
            raise ValueError(f"mixed_precision_policy must be an instance of {required_type}.")

    def set_cpu_offload(self):
        if self.fsdp_version == 2:
            from torch.distributed.fsdp import CPUOffloadPolicy, OffloadPolicy
        else:
            from torch.distributed.fsdp import CPUOffload

        if isinstance(self.cpu_offload, bool):
            if self.fsdp_version == 2:
                if not self.cpu_offload:
                    self.cpu_offload = OffloadPolicy()
                else:
                    self.cpu_offload = CPUOffloadPolicy()
            else:
                self.cpu_offload = CPUOffload(offload_params=self.cpu_offload)

    def validate_cpu_offload(self):
        if self.fsdp_version == 2:
            from torch.distributed.fsdp import OffloadPolicy
        else:
            from torch.distributed.fsdp import CPUOffload

        if self.fsdp_version == 2 and not isinstance(self.cpu_offload, OffloadPolicy):
            raise ValueError(
                f"`cpu_offload` must be an instance of `torch.distributed.fsdp.OffloadPolicy` in FSDP2, got {self.cpu_offload}"
            )
        if self.fsdp_version == 1 and not isinstance(self.cpu_offload, CPUOffload):
            raise ValueError(
                f"`cpu_offload` must be an instance of `torch.distributed.fsdp.CPUOffload` in FSDP1, got {self.cpu_offload}"
            )


@dataclass
class TorchTensorParallelPlugin:
    """
    Plugin holding the settings used to enable tensor parallelism with PyTorch >= 2.0.

    Args:
        tp_size (`int`, defaults to `1`):
            Tensor parallel size used in the device mesh preparation.
        torch_device_mesh (`torch.distributed.DeviceMesh`, *optional*, defaults to `None`):
            Device mesh associated with this plugin.
    """

    tp_size: int = field(
        metadata={"help": "tensor parallel size will be used in the device mesh preparation"},
        default=1,
    )

    # The annotation is quoted so `torch.distributed.DeviceMesh` is not looked up when the class is created.
    torch_device_mesh: Optional["torch.distributed.DeviceMesh"] = None


@dataclass
class TorchContextParallelConfig:
    """
    Configuration for context parallelism in PyTorch.

    Args:
        cp_comm_strategy (`str`, *optional*):
            Communication strategy, either `"allgather"` or `"alltoall"`. When left unset it is read from the
            `PARALLELISM_CONFIG_CP_COMM_STRATEGY` environment variable, falling back to `"allgather"`.
    """

    cp_comm_strategy: Optional[str] = field(
        metadata={
            "help": "Communication strategy for context parallelism. Can be one of 'allgather' or 'alltoall'. Defaults to 'allgather'."
        },
        default=None,
    )

    def __post_init__(self):
        # Refuse to build the config at all on a PyTorch build that is too old.
        torch_is_recent_enough = is_torch_version(">=", BETA_CP_AVAILABLE_PYTORCH_VERSION)
        if not torch_is_recent_enough:
            raise ValueError(
                f"FSDP2-based Context parallelism is only available in PyTorch {BETA_CP_AVAILABLE_PYTORCH_VERSION} and later versions. "
                "Please upgrade your PyTorch version."
            )

        strategy = self.cp_comm_strategy
        if strategy is None:
            # An explicit constructor argument wins; the environment is only a fallback.
            strategy = os.environ.get("PARALLELISM_CONFIG_CP_COMM_STRATEGY", "allgather")
            self.cp_comm_strategy = strategy

        if strategy not in ["allgather", "alltoall"]:
            raise ValueError(f"Invalid cp_comm_strategy: {strategy}. Must be one of 'allgather' or 'alltoall'.")


@dataclass
class DeepSpeedSequenceParallelConfig:
    """
    Configuration for DeepSpeed sequence parallelism.

    Fields left as `None` are resolved in `__post_init__` from `PARALLELISM_CONFIG_SP_*` environment variables.

    Args:
        sp_seq_length (`int`, *optional*):
            Sequence length to use when every batch has the same length. Read from
            `PARALLELISM_CONFIG_SP_SEQ_LENGTH` when unset and `sp_seq_length_is_variable` is `False`.
        sp_seq_length_is_variable (`bool`, *optional*):
            Whether the sequence length may change between batches. Read from
            `PARALLELISM_CONFIG_SP_SEQ_LENGTH_IS_VARIABLE` when unset, which defaults to `"true"`.
        sp_attn_implementation (`str`, *optional*):
            Attention implementation. Read from `PARALLELISM_CONFIG_SP_ATTN_IMPLEMENTATION` when unset.
    """

    sp_seq_length: Optional[int] = field(
        metadata={
            "help": "Sequence length for when batches are all of the same length. For variable sequence lengths across batches set `sp_seq_length_is_variable=True` and leave this field unset"
        },
        default=None,
    )
    sp_seq_length_is_variable: Optional[bool] = field(
        metadata={
            "help": "If `True` will work with a sequence length that may change between batches, in which case `sp_seq_length` value can be set to anything divisible by cp size or remain unset. If `False` then `sp_seq_length` needs to match the batch's sequence length dimension. The default is `True`."
        },
        default=None,
    )
    sp_attn_implementation: Optional[str] = field(
        metadata={
            "help": "Attention implementation to use. Can be one of 'flash_attention_2', 'flash_attention_3', 'sdpa', or a hub-hosted kernel (e.g. 'kernels-community/flash-attn2'). Defaults to `sdpa`."
        },
        default=None,
    )

    def __post_init__(self):
        env = os.environ

        # `sp_seq_length_is_variable` and `sp_seq_length` depend on each other, so the flag is resolved first.
        if self.sp_seq_length_is_variable is None:
            raw_flag = env.get("PARALLELISM_CONFIG_SP_SEQ_LENGTH_IS_VARIABLE", "true")
            self.sp_seq_length_is_variable = raw_flag.lower() == "true"

        if self.sp_seq_length is None and not self.sp_seq_length_is_variable:
            raw_length = env.get("PARALLELISM_CONFIG_SP_SEQ_LENGTH")
            if raw_length is None:
                raise ValueError(
                    "when `sp_seq_length_is_variable` is `False` `sp_seq_length` must be provided either through the constructor or the environment variable PARALLELISM_CONFIG_SP_SEQ_LENGTH"
                )
            # The literal string "None" in the environment is treated as "no value".
            self.sp_seq_length = None if raw_length == "None" else int(raw_length)

        if self.sp_attn_implementation is None:
            self.sp_attn_implementation = env.get("PARALLELISM_CONFIG_SP_ATTN_IMPLEMENTATION", None)

        attn = self.sp_attn_implementation
        if attn is None:
            return

        _unsupported_sp_attn = ["eager", "flex_attention"]
        if attn in _unsupported_sp_attn:
            raise ValueError(
                f"Invalid sp_attn_implementation: {attn}. "
                f"'eager' and 'flex_attention' are not supported with sequence parallelism."
            )

        _builtin_sp_attn = ["flash_attention_2", "flash_attention_3", "sdpa"]
        if attn in _builtin_sp_attn:
            return

        # Also allow hub-hosted flash attention kernels (e.g. "kernels-community/flash-attn2").
        # These register into transformers' ALL_ATTENTION_FUNCTIONS at model load time and
        # DeepSpeed validates against that registry directly.
        looks_like_hub_flash_attn = "/" in attn and "flash-attn" in attn
        if not looks_like_hub_flash_attn:
            raise ValueError(
                f"Invalid sp_attn_implementation: {attn}. "
                f"Must be one of {_builtin_sp_attn} or a hub-hosted flash attention kernel "
                f"(e.g. 'kernels-community/flash-attn2')."
            )


@dataclass
class TorchTensorParallelConfig:
    """
    Pass this object to your [`Accelerator`] to customize torch tensor parallelism.

    Args:
        enable_async_tp (`bool`, defaults to `False`):
            Requests async tensor parallelism. It is currently not supported, so enabling it only emits a warning.
    """

    enable_async_tp: bool = False

    def __post_init__(self):
        # Version requirements are checked in order: PyTorch first, then transformers.
        torch_is_recent_enough = is_torch_version(">=", BETA_TP_AVAILABLE_PYTORCH_VERSION)
        if not torch_is_recent_enough:
            raise ValueError(
                f"Torch tensor parallelism is only available in PyTorch {BETA_TP_AVAILABLE_PYTORCH_VERSION} and later versions. "
                "Please upgrade your PyTorch version."
            )

        transformers_is_recent_enough = compare_versions("transformers", ">=", BETA_TP_AVAILABLE_TRANSFORMERS_VERSION)
        if not transformers_is_recent_enough:
            raise ValueError(f"TP requires transformers >= {BETA_TP_AVAILABLE_TRANSFORMERS_VERSION}")

        if self.enable_async_tp:
            warnings.warn("Async tensor parallelism is currently not supported, ignoring this option.")


@dataclass
class MegatronLMPlugin:
    """
    Plugin for Megatron-LM to enable tensor, pipeline, sequence and data parallelism. Also to enable selective
    activation recomputation and optimized fused kernels.

    Args:
        tp_degree (`int`, defaults to `None`):
            Tensor parallelism degree.
        pp_degree (`int`, defaults to `None`):
            Pipeline parallelism degree.
        num_micro_batches (`int`, defaults to `None`):
            Number of micro-batches.
        gradient_clipping (`float`, defaults to `None`):
            Gradient clipping value based on global L2 Norm (0 to disable).
        sequence_parallelism (`bool`, defaults to `None`):
            Enable sequence parallelism.
        recompute_activations (`bool`, defaults to `None`):
            Enable selective activation recomputation.
        use_distributed_optimizer (`bool`, defaults to `None`):
            Enable distributed optimizer.
        pipeline_model_parallel_split_rank (`int`, defaults to `None`):
            Rank where encoder and decoder should be split.
        num_layers_per_virtual_pipeline_stage (`int`, defaults to `None`):
            Number of layers per virtual pipeline stage.
        is_train_batch_min (`str`, defaults to `True`):
            If both train & eval dataloaders are specified, this will decide the `micro_batch_size`.
        train_iters (`int`, defaults to `None`):
            Total number of iterations to train over all training runs. Note that either train-iters or train-samples
            should be provided when using `MegatronLMDummyScheduler`.
        train_samples (`int`, defaults to `None`):
            Total number of samples to train over all training runs. Note that either train-iters or train-samples
            should be provided when using `MegatronLMDummyScheduler`.
        weight_decay_incr_style (`str`, defaults to `'constant'`):
            Weight decay increment function. choices=["constant", "linear", "cosine"].
        start_weight_decay (`float`, defaults to `None`):
            Initial weight decay coefficient for L2 regularization.
        end_weight_decay (`float`, defaults to `None`):
            End of run weight decay coefficient for L2 regularization.
        lr_decay_style (`str`, defaults to `'linear'`):
            Learning rate decay function. choices=['constant', 'linear', 'cosine'].
        lr_decay_iters (`int`, defaults to `None`):
            Number of iterations for learning rate decay. If None defaults to `train_iters`.
        lr_decay_samples (`int`, defaults to `None`):
            Number of samples for learning rate decay. If None defaults to `train_samples`.
        lr_warmup_iters (`int`, defaults to `None`):
            Number of iterations to linearly warmup learning rate over.
        lr_warmup_samples (`int`, defaults to `None`):
            Number of samples to linearly warmup learning rate over.
        lr_warmup_fraction (`float`, defaults to `None`):
            Fraction of lr-warmup-(iters/samples) to linearly warmup learning rate over.
        min_lr (`float`, defaults to `0`):
            Minimum value for learning rate. The scheduler clip values below this threshold.
        consumed_samples (`List`, defaults to `None`):
            Number of samples consumed in the same order as the dataloaders to `accelerator.prepare` call.
        no_wd_decay_cond (`Optional`, defaults to `None`):
            Condition to disable weight decay.
        scale_lr_cond (`Optional`, defaults to `None`):
            Condition to scale learning rate.
        lr_mult (`float`, defaults to `1.0`):
            Learning rate multiplier.
        megatron_dataset_flag (`bool`, defaults to `False`):
            Whether the format of dataset follows Megatron-LM Indexed/Cached/MemoryMapped format.
        seq_length (`int`, defaults to `None`):
            Maximum sequence length to process.
        encoder_seq_length (`int`, defaults to `None`):
            Maximum sequence length to process for the encoder.
        decoder_seq_length (`int`, defaults to `None`):
            Maximum sequence length to process for the decoder.
        tensorboard_dir (`str`, defaults to `None`):
            Path to save tensorboard logs.
        set_all_logging_options (`bool`, defaults to `False`):
            Whether to set all logging options.
        eval_iters (`int`, defaults to `100`):
            Number of iterations to run for evaluation validation/test for.
        eval_interval (`int`, defaults to `1000`):
            Interval between running evaluation on validation set.
        return_logits (`bool`, defaults to `False`):
            Whether to return logits from the model.
        custom_train_step_class (`Optional`, defaults to `None`):
            Custom train step class.
        custom_train_step_kwargs (`Optional`, defaults to `None`):
            Custom train step kwargs.
        custom_model_provider_function (`Optional`, defaults to `None`):
            Custom model provider function.
        custom_prepare_model_function (`Optional`, defaults to `None`):
            Custom prepare model function.
        custom_megatron_datasets_provider_function (`Optional`, defaults to `None`):
            Custom megatron train_valid_test datasets provider function.
        custom_get_batch_function (`Optional`, defaults to `None`):
            Custom get batch function.
        custom_loss_function (`Optional`, defaults to `None`):
            Custom loss function.
        other_megatron_args (`Optional`, defaults to `None`):
            Other Megatron-LM arguments. Please refer Megatron-LM.
    """

    tp_degree: int = field(default=None, metadata={"help": "tensor parallelism degree."})
    pp_degree: int = field(default=None, metadata={"help": "pipeline parallelism degree."})
    use_custom_fsdp: bool = field(default=None, metadata={"help": "use custom fsdp."})
    overlap_cpu_optimizer_d2h_h2d: bool = field(
        default=None, metadata={"help": "overlap CPU optimizer step, gradients D2H and updated parameters H2D."}
    )
    no_load_optim: bool = field(default=None, metadata={"help": "do not load optimizer."})
    eod_mask_loss: bool = field(default=None, metadata={"help": "use eod mask loss."})
    no_save_optim: bool = field(default=None, metadata={"help": "do not save optimizer."})
    optimizer_cpu_offload: bool = field(default=None, metadata={"help": "use CPU offload for optimizer."})
    use_precision_aware_optimizer: bool = field(default=None, metadata={"help": "use precision aware optimizer."})
    decoder_last_pipeline_num_layers: int = field(
        default=None,
        metadata={
            "help": "decoder last pipeline number of layers, default None is even split of transformer layers across all pipeline stages."
        },
    )
    recompute_granularity: str = field(default=None, metadata={"help": "recompute granularity (full, selective)."})
    recompute_method: str = field(default=None, metadata={"help": "recompute method (uniform, block)."})
    recompute_num_layers: int = field(default=None, metadata={"help": "number of layers to recompute."})
    attention_backend: bool = field(default=None, metadata={"help": "enable attention backend."})
    expert_model_parallel_size: int = field(default=None, metadata={"help": "expert model parallel size."})
    context_parallel_size: int = field(default=None, metadata={"help": "context parallel size."})
    attention_dropout: float = field(default=None, metadata={"help": "attention dropout rate."})
    hidden_dropout: float = field(default=None, metadata={"help": "hidden dropout rate."})
    attention_softmax_in_fp32: bool = field(default=None, metadata={"help": "use fp32 for attention softmax."})
    expert_tensor_parallel_size: int = field(default=None, metadata={"help": "expert tensor parallel size."})
    calculate_per_token_loss: bool = field(default=None, metadata={"help": "calculate per token loss."})
    use_rotary_position_embeddings: bool = field(default=None, metadata={"help": "use rotary position embeddings."})
    num_micro_batches: int = field(default=None, metadata={"help": "number of micro-batches."})
    gradient_clipping: float = field(
        default=None,
        metadata={"help": "gradient clipping value based on global L2 Norm (0 to disable)"},
    )
    sequence_parallelism: bool = field(
        default=None,
        metadata={"help": "enable sequence parallelism"},
    )
    recompute_activations: bool = field(
        default=None,
        metadata={"help": "enable selective activation recomputation"},
    )
    use_distributed_optimizer: bool = field(
        default=None,
        metadata={"help": "enable distributed optimizer"},
    )
    pipeline_model_parallel_split_rank: int = field(
        default=None,
        metadata={"help": "Rank where encoder and decoder should be split."},
    )
    num_layers_per_virtual_pipeline_stage: int = field(
        default=None, metadata={"help": "Number of layers per virtual pipeline stage."}
    )
    is_train_batch_min: str = field(
        default=True,
        metadata={"help": "If both train & eval dataloaders are specified, this will decide the micro_batch_size"},
    )
    train_iters: int = field(
        default=None,
        metadata={
            "help": "Total number of iterations to train over all training runs. "
            "Note that either train-iters or train-samples should be provided when using `MegatronLMDummyScheduler`"
        },
    )
    train_samples: int = field(
        default=None,
        metadata={
            "help": "Total number of samples to train over all training runs. "
            "Note that either train-iters or train-samples should be provided when using `MegatronLMDummyScheduler`"
        },
    )
    weight_decay_incr_style: str = field(
        default="constant",
        metadata={"help": 'Weight decay increment function. choices=["constant", "linear", "cosine"]. '},
    )
    start_weight_decay: float = field(
        default=None,
        metadata={"help": "Initial weight decay coefficient for L2 regularization."},
    )
    end_weight_decay: float = field(
        default=None,
        metadata={"help": "End of run weight decay coefficient for L2 regularization."},
    )
    lr_decay_style: str = field(
        default="linear",
        metadata={"help": "Learning rate decay function. choices=['constant', 'linear', 'cosine']."},
    )
    lr_decay_iters: int = field(
        default=None,
        metadata={"help": "Number of iterations for learning rate decay. If None defaults to `train_iters`."},
    )
    lr_decay_samples: int = field(
        default=None,
        metadata={"help": "Number of samples for learning rate decay. If None defaults to `train_samples`."},
    )
    lr_warmup_iters: int = field(
        default=None,
        metadata={"help": "number of iterations to linearly warmup learning rate over."},
    )
    lr_warmup_samples: int = field(
        default=None,
        metadata={"help": "number of samples to linearly warmup learning rate over."},
    )
    lr_warmup_fraction: float = field(
        default=None,
        metadata={"help": "fraction of lr-warmup-(iters/samples) to linearly warmup learning rate over."},
    )
    min_lr: float = field(
        default=0,
        metadata={"help": "Minimum value for learning rate. The scheduler clip values below this threshold."},
    )
    consumed_samples: list[int] = field(
        default=None,
        metadata={
            "help": "Number of samples consumed in the same order as the dataloaders to `accelerator.prepare` call."
        },
    )
    no_wd_decay_cond: Optional[Callable] = field(default=None, metadata={"help": "Condition to disable weight decay."})
    scale_lr_cond: Optional[Callable] = field(default=None, metadata={"help": "Condition to scale learning rate."})
    lr_mult: float = field(default=1.0, metadata={"help": "Learning rate multiplier."})
    megatron_dataset_flag: bool = field(
        default=False,
        metadata={"help": "Whether the format of dataset follows Megatron-LM Indexed/Cached/MemoryMapped format."},
    )
    seq_length: int = field(
        default=None,
        metadata={"help": "Maximum sequence length to process."},
    )
    encoder_seq_length: int = field(
        default=None,
        metadata={"help": "Maximum sequence length to process for the encoder."},
    )
    decoder_seq_length: int = field(
        default=None,
        metadata={"help": "Maximum sequence length to process for the decoder."},
    )
    tensorboard_dir: str = field(
        default=None,
        metadata={"help": "Path to save tensorboard logs."},
    )
    set_all_logging_options: bool = field(
        default=False,
        metadata={"help": "Whether to set all logging options."},
    )
    eval_iters: int = field(
        default=100,
        metadata={"help": "Number of iterations to run for evaluation validation/test for."},
    )
    eval_interval: int = field(
        default=1000,
        metadata={"help": "Interval between running evaluation on validation set."},
    )
    return_logits: bool = field(
        default=False,
        metadata={"help": "Whether to return logits from the model."},
    )

    # custom train step args
    custom_train_step_class: Optional[Any] = field(
        default=None,
        metadata={"help": "Custom train step class."},
    )
    custom_train_step_kwargs: Optional[dict[str, Any]] = field(
        default=None,
        metadata={"help": "Custom train step kwargs."},
    )

    # custom model args
    custom_model_provider_function: Optional[Callable] = field(
        default=None,
        metadata={"help": "Custom model provider function."},
    )
    custom_prepare_model_function: Optional[Callable] = field(
        default=None,
        metadata={"help": "Custom prepare model function."},
    )
    custom_megatron_datasets_provider_function: Optional[Callable] = field(
        default=None,
        metadata={"help": "Custom megatron train_valid_test datasets provider function."},
    )
    custom_get_batch_function: Optional[Callable] = field(
        default=None,
        metadata={"help": "Custom get batch function."},
    )
    custom_loss_function: Optional[Callable] = field(
        default=None,
        metadata={"help": "Custom loss function."},
    )

    # remaining args such as enabling Alibi/ROPE positional embeddings,
    # wandb logging, Multi-Query Attention, etc.
    other_megatron_args: Optional[dict[str, Any]] = field(
        default=None,
        metadata={"help": "Other Megatron-LM arguments. Please refer Megatron-LM"},
    )

    def __post_init__(self):
        prefix = "MEGATRON_LM_"
        if self.tp_degree is None:
            self.tp_degree = int(os.environ.get(prefix + "TP_DEGREE", 1))
        if self.pp_degree is None:
            self.pp_degree = int(os.environ.get(prefix + "PP_DEGREE", 1))
        if self.use_custom_fsdp is None:
            self.use_custom_fsdp = str_to_bool(os.environ.get(prefix + "USE_CUSTOM_FSDP", "False")) == 1
        if self.no_load_optim is None:
            self.no_load_optim = str_to_bool(os.environ.get(prefix + "NO_LOAD_OPTIM", "False")) == 1
        if self.eod_mask_loss is None:
            self.eod_mask_loss = str_to_bool(os.environ.get(prefix + "EOD_MASK_LOSS", "False")) == 1
        if self.no_save_optim is None:
            self.no_save_optim = str_to_bool(os.environ.get(prefix + "NO_SAVE_OPTIM", "False")) == 1
        if self.optimizer_cpu_offload is None:
            self.optimizer_cpu_offload = str_to_bool(os.environ.get(prefix + "OPTIMIZER_CPU_OFFLOAD", "False")) == 1
        if self.overlap_cpu_optimizer_d2h_h2d is None:
            self.overlap_cpu_optimizer_d2h_h2d = (
                str_to_bool(os.environ.get(prefix + "OVERLAP_CPU_OPTIMIZER_D2H_H2D", "False")) == 1
            )
        if self.use_precision_aware_optimizer is None:
            self.use_precision_aware_optimizer = (
                str_to_bool(os.environ.get(prefix + "USE_PRECISION_AWARE_OPTIMIZER", "False")) == 1
            )
        if self.decoder_last_pipeline_num_layers is None:
            if os.environ.get(prefix + "DECODER_LAST_PIPELINE_NUM_LAYERS") is not None:
                self.decoder_last_pipeline_num_layers = int(
                    os.environ.get(prefix + "DECODER_LAST_PIPELINE_NUM_LAYERS", 0)
                )
            else:
                self.decoder_last_pipeline_num_layers = None
        if self.num_micro_batches is None:
            self.num_micro_batches = int(os.environ.get(prefix + "NUM_MICRO_BATCHES", 1))
        if self.gradient_clipping is None:
            self.gradient_clipping = float(os.environ.get(prefix + "GRADIENT_CLIPPING", 1.0))
        if self.recompute_activations is None:
            self.recompute_activations = str_to_bool(os.environ.get(prefix + "RECOMPUTE_ACTIVATIONS", "False")) == 1
        if self.use_distributed_optimizer is None:
            self.use_distributed_optimizer = (
                str_to_bool(os.environ.get(prefix + "USE_DISTRIBUTED_OPTIMIZER", "False")) == 1
            )
        if self.sequence_parallelism is None:
            self.sequence_parallelism = str_to_bool(os.environ.get(prefix + "SEQUENCE_PARALLELISM", "False")) == 1
        if self.recompute_granularity is None:
            self.recompute_granularity = os.environ.get(prefix + "RECOMPUTE_GRANULARITY", "full")
        if self.recompute_method is None:
            self.recompute_method = os.environ.get(prefix + "RECOMPUTE_METHOD", "uniform")
        if self.recompute_num_layers is None:
            self.recompute_num_layers = int(os.environ.get(prefix + "RECOMPUTE_NUM_LAYERS", 1))
        if self.attention_backend is None:
            self.attention_backend = str_to_bool(os.environ.get(prefix + "ATTENTION_BACKEND", "True")) == 1
        if self.expert_model_parallel_size is None:
            self.expert_model_parallel_size = int(os.environ.get(prefix + "EXPERT_MODEL_PARALLEL_SIZE", 1))
        if self.context_parallel_size is None:
            self.context_parallel_size = int(os.environ.get(prefix + "CONTEXT_PARALLEL_SIZE", 2))
        if self.attention_dropout is None:
            self.attention_dropout = float(os.environ.get(prefix + "ATTENTION_DROPOUT", "0.0"))
        if self.hidden_dropout is None:
            self.hidden_dropout = float(os.environ.get(prefix + "HIDDEN_DROPOUT", "0.0"))
        if self.attention_softmax_in_fp32 is None:
            self.attention_softmax_in_fp32 = (
                str_to_bool(os.environ.get(prefix + "ATTENTION_SOFTMAX_IN_FP32", "True")) == 1
            )
        if self.expert_tensor_parallel_size is None:
            self.expert_tensor_parallel_size = int(os.environ.get(prefix + "EXPERT_TENSOR_PARALLEL_SIZE", 1))
        if self.calculate_per_token_loss is None:
            self.calculate_per_token_loss = (
                str_to_bool(os.environ.get(prefix + "CALCULATE_PER_TOKEN_LOSS", "True")) == 1
            )
        if self.use_rotary_position_embeddings is None:
            self.use_rotary_position_embeddings = (
                str_to_bool(os.environ.get(prefix + "USE_ROTARY_POSITION_EMBEDDINGS", "True")) == 1
            )

        if self.pp_degree > 1 or self.use_distributed_optimizer:
            self.DDP_impl = "local"
        else:
            self.DDP_impl = "torch"

        if self.consumed_samples is not None:
            if len(self.consumed_samples) == 1:
                self.consumed_samples.extend([0, 0])
            elif len(self.consumed_samples) == 2:
                self.consumed_samples.append(0)

        self.megatron_lm_default_args = {
            "tensor_model_parallel_size": self.tp_degree,
            "pipeline_model_parallel_size": self.pp_degree,
            "pipeline_model_parallel_split_rank": self.pipeline_model_parallel_split_rank,
            "num_layers_per_virtual_pipeline_stage": self.num_layers_per_virtual_pipeline_stage,
            "DDP_impl": self.DDP_impl,
            "use_distributed_optimizer": self.use_distributed_optimizer,
            "sequence_parallel": self.sequence_parallelism,
            "clip_grad": self.gradient_clipping,
            "num_micro_batches": self.num_micro_batches,
            "consumed_samples": self.consumed_samples,
            "no_wd_decay_cond": self.no_wd_decay_cond,
            "scale_lr_cond": self.scale_lr_cond,
            "lr_mult": self.lr_mult,
            "megatron_dataset_flag": self.megatron_dataset_flag,
            "eval_iters": self.eval_iters,
            "eval_interval": self.eval_interval,
            "use_custom_fsdp": self.use_custom_fsdp,
            "no_load_optim": self.no_load_optim,
            "eod_mask_loss": self.eod_mask_loss,
            "no_save_optim": self.no_save_optim,
            "optimizer_cpu_offload": self.optimizer_cpu_offload,
            "overlap_cpu_optimizer_d2h_h2d": self.overlap_cpu_optimizer_d2h_h2d,
            "use_precision_aware_optimizer": self.use_precision_aware_optimizer,
            "decoder_last_pipeline_num_layers": self.decoder_last_pipeline_num_layers,
            "recompute_granularity": self.recompute_granularity,
            "recompute_method": self.recompute_method,
            "recompute_num_layers": self.recompute_num_layers,
            "attention_backend": self.attention_backend,
            "expert_model_parallel_size": self.expert_model_parallel_size,
            "context_parallel_size": self.context_parallel_size,
            "attention_dropout": self.attention_dropout,
            "hidden_dropout": self.hidden_dropout,
            "attention_softmax_in_fp32": self.attention_softmax_in_fp32,
            "expert_tensor_parallel_size": self.expert_tensor_parallel_size,
            "calculate_per_token_loss": self.calculate_per_token_loss,
            "use_rotary_position_embeddings": self.use_rotary_position_embeddings,
        }
        if self.tensorboard_dir is not None:
            self.megatron_lm_default_args["tensorboard_dir"] = self.tensorboard_dir
            if self.set_all_logging_options:
                self.set_tensorboard_logging_options()
        if self.other_megatron_args is not None:
            self.megatron_lm_default_args.update(self.other_megatron_args)

    def set_network_size_args(self, model, batch_data=None):
        """
        Runs the first registered Megatron config parser whose key occurs in the model's `model_type`.

        Args:
            model: Object exposing `config.model_type`; it is handed to the selected parser.
            batch_data (*optional*): Forwarded unchanged to the selected parser.

        Raises:
            ValueError: If no key of `MODEL_CONFIGS_TO_MEGATRON_PARSERS` is a substring of the lower-cased model
            type.
        """
        config_type = model.config.model_type.lower()
        # Substring match, taken in registration order; the first hit wins.
        matching_parser = next(
            (parser for key, parser in MODEL_CONFIGS_TO_MEGATRON_PARSERS.items() if key in config_type),
            None,
        )
        if matching_parser is None:
            raise ValueError(
                f"Accelerate Megatron-LM integration not supports {config_type} model. "
                "You can add your own model config parser."
            )
        matching_parser(self, model, batch_data)

    def set_mixed_precision(self, mixed_precision):
        if mixed_precision == "fp16":
            self.megatron_lm_default_args["fp16"] = True
        elif mixed_precision == "bf16":
            self.megatron_lm_default_args["bf16"] = True
            self.DDP_impl = "local"
            self.megatron_lm_default_args["DDP_impl"] = self.DDP_impl

    def set_training_args(self, micro_batch_size, dp_degree):
        self.data_parallel_size = dp_degree
        self.micro_batch_size = micro_batch_size
        self.global_batch_size = dp_degree * micro_batch_size * self.num_micro_batches
        self.megatron_lm_default_args["data_parallel_size"] = self.data_parallel_size
        self.megatron_lm_default_args["micro_batch_size"] = self.micro_batch_size
        self.megatron_lm_default_args["global_batch_size"] = self.global_batch_size

    def set_optimizer_type(self, optimizer):
        optimizer_name = optimizer.__class__.__name__.lower()
        if "adam" in optimizer_name:
            self.megatron_lm_default_args["optimizer"] = "adam"
            self.megatron_lm_default_args["adam_beta1"] = optimizer.defaults["betas"][0]
            self.megatron_lm_default_args["adam_beta2"] = optimizer.defaults["betas"][1]
            self.megatron_lm_default_args["adam_eps"] = optimizer.defaults["eps"]
        elif "sgd" in optimizer_name:
            self.megatron_lm_default_args["optimizer"] = "sgd"
            self.megatron_lm_default_args["sgd_momentum"] = optimizer.defaults["momentum"]
        else:
            raise ValueError(f"Optimizer {optimizer_name} is not supported by Megatron-LM")

        self.megatron_lm_default_args["lr"] = optimizer.defaults["lr"]
        self.megatron_lm_default_args["weight_decay"] = optimizer.defaults["weight_decay"]

    def set_scheduler_args(self, scheduler):
        if self.train_iters is None:
            self.train_iters = scheduler.total_num_steps // self.megatron_lm_default_args["data_parallel_size"]
            if self.train_samples is not None:
                self.train_samples = None
                warnings.warn(
                    "Ignoring `train_samples` as `train_iters` based on scheduler is being used for training."
                )
        if self.lr_warmup_iters is None:
            self.lr_warmup_iters = scheduler.warmup_num_steps // self.megatron_lm_default_args["data_parallel_size"]
            if self.lr_warmup_samples is not None:
                warnings.warn(
                    "Ignoring `lr_warmup_samples` as `lr_warmup_iters` based on scheduler is being used for training."
                )
            self.lr_warmup_samples = 0

        self.megatron_lm_default_args["train_iters"] = self.train_iters
        self.megatron_lm_default_args["lr_warmup_iters"] = self.lr_warmup_iters
        self.megatron_lm_default_args["train_samples"] = self.train_samples
        self.megatron_lm_default_args["lr_warmup_samples"] = self.lr_warmup_samples
        self.megatron_lm_default_args["lr_decay_iters"] = self.lr_decay_iters
        self.megatron_lm_default_args["lr_decay_samples"] = self.lr_decay_samples
        self.megatron_lm_default_args["lr_warmup_fraction"] = self.lr_warmup_fraction
        self.megatron_lm_default_args["lr_decay_style"] = self.lr_decay_style
        self.megatron_lm_default_args["weight_decay_incr_style"] = self.weight_decay_incr_style
        self.megatron_lm_default_args["start_weight_decay"] = self.start_weight_decay
        self.megatron_lm_default_args["end_weight_decay"] = self.end_weight_decay
        self.megatron_lm_default_args["min_lr"] = self.min_lr

    def set_tensorboard_logging_options(self):
        """
        Turns on every logging option declared by Megatron-LM's logging argument group.

        The options are discovered by parsing the known arguments with Megatron's `_add_logging_args` parser; the
        parsed namespace is kept in `self.dataset_args`. Every `log_*` option is set to `True` in
        `megatron_lm_default_args`. Every `no_log_*` option is recorded under its `log_*` name, also as `True`.
        """
        # Imported lazily so Megatron-LM is only required when this method is actually called.
        from megatron.training.arguments import _add_logging_args

        parser = argparse.ArgumentParser()
        parser = _add_logging_args(parser)
        logging_args = parser.parse_known_args()
        self.dataset_args = vars(logging_args[0])

        negation_prefix = "no_"
        for key in self.dataset_args:
            if key.startswith("log_"):
                self.megatron_lm_default_args[key] = True
            elif key.startswith("no_log_"):
                # Strip only the leading "no_"; `str.replace` would also remove any later "no_" in the name.
                self.megatron_lm_default_args[key[len(negation_prefix) :]] = True


# Registry mapping a model-type key to the function that parses that model's config for Megatron-LM.
MODEL_CONFIGS_TO_MEGATRON_PARSERS = {}


def add_model_config_to_megatron_parser(model_type: str):
    """
    Returns a decorator that registers a config parser under `model_type`.

    The undecorated function is stored in `MODEL_CONFIGS_TO_MEGATRON_PARSERS`; the decorated name is bound to a
    thin wrapper that forwards all arguments to it.
    """

    def register(func):
        MODEL_CONFIGS_TO_MEGATRON_PARSERS[model_type] = func

        @functools.wraps(func)
        def passthrough(*args, **kwargs):
            return func(*args, **kwargs)

        return passthrough

    return register


@add_model_config_to_megatron_parser("megatron-bert")
def parse_bert_config(megatron_lm_plugin, model, batch_data):
    """
    Fill `megatron_lm_plugin.megatron_lm_default_args` from a BERT-style model config.

    The sequence length is resolved in place on the plugin with this precedence:
    `encoder_seq_length` > an explicitly set `seq_length` > `batch_data["input_ids"].shape[1]` >
    `model.config.max_position_embeddings`.

    Args:
        megatron_lm_plugin: Plugin whose `seq_length` and `megatron_lm_default_args` are updated.
        model: Model exposing a `config` with the BERT hyper-parameters read below.
        batch_data: Optional batch; when given, its `input_ids` second dimension is used as a fallback length.
    """
    config = model.config
    model_type_name = "bert"
    num_layers = config.num_hidden_layers
    hidden_size = config.hidden_size
    num_attention_heads = config.num_attention_heads
    max_position_embeddings = config.max_position_embeddings
    num_labels = config.num_labels
    orig_vocab_size = config.vocab_size
    # Only masked-LM heads are treated as pretraining.
    pretraining_flag = "maskedlm" in model.__class__.__name__.lower()

    if megatron_lm_plugin.encoder_seq_length is not None:
        if megatron_lm_plugin.seq_length is not None:
            warnings.warn("Both `seq_length` and `encoder_seq_length` are set. Using `encoder_seq_length`.")
        megatron_lm_plugin.seq_length = megatron_lm_plugin.encoder_seq_length
    elif megatron_lm_plugin.seq_length is None:
        # Nothing was configured explicitly: fall back to the batch shape, then to the model limit.
        # (An explicitly set `seq_length` is kept as is; it must not be overwritten with `None`.)
        if batch_data is not None:
            megatron_lm_plugin.seq_length = batch_data["input_ids"].shape[1]
        else:
            megatron_lm_plugin.seq_length = max_position_embeddings

    megatron_lm_plugin.megatron_lm_default_args["seq_length"] = megatron_lm_plugin.seq_length
    megatron_lm_plugin.megatron_lm_default_args["model_type_name"] = model_type_name
    megatron_lm_plugin.megatron_lm_default_args["num_layers"] = num_layers
    megatron_lm_plugin.megatron_lm_default_args["hidden_size"] = hidden_size
    megatron_lm_plugin.megatron_lm_default_args["num_attention_heads"] = num_attention_heads
    megatron_lm_plugin.megatron_lm_default_args["max_position_embeddings"] = max_position_embeddings
    megatron_lm_plugin.megatron_lm_default_args["pretraining_flag"] = pretraining_flag
    megatron_lm_plugin.megatron_lm_default_args["orig_vocab_size"] = orig_vocab_size
    megatron_lm_plugin.megatron_lm_default_args["model_return_dict"] = config.return_dict
    megatron_lm_plugin.megatron_lm_default_args["num_labels"] = num_labels


@add_model_config_to_megatron_parser("gpt2")
def parse_gpt2_config(megatron_lm_plugin, model, batch_data):
    """
    Fill `megatron_lm_plugin.megatron_lm_default_args` from a GPT-2 model config.

    The sequence length is resolved in place on the plugin with this precedence:
    `decoder_seq_length` > an explicitly set `seq_length` > `batch_data["input_ids"].shape[1]` >
    `model.config.n_positions`.

    Args:
        megatron_lm_plugin: Plugin whose `seq_length` and `megatron_lm_default_args` are updated.
        model: Model exposing a `config` with the GPT-2 hyper-parameters read below.
        batch_data: Optional batch; when given, its `input_ids` second dimension is used as a fallback length.
    """
    config = model.config
    model_type_name = "gpt"
    num_layers = config.n_layer
    hidden_size = config.n_embd
    num_attention_heads = config.n_head
    max_position_embeddings = config.n_positions
    orig_vocab_size = config.vocab_size
    pretraining_flag = True

    if megatron_lm_plugin.decoder_seq_length is not None:
        if megatron_lm_plugin.seq_length is not None:
            warnings.warn("Both `seq_length` and `decoder_seq_length` are set. Using `decoder_seq_length`.")
        megatron_lm_plugin.seq_length = megatron_lm_plugin.decoder_seq_length
    elif megatron_lm_plugin.seq_length is None:
        # Nothing was configured explicitly: fall back to the batch shape, then to the model limit.
        # (An explicitly set `seq_length` is kept as is; it must not be overwritten with `None`.)
        if batch_data is not None:
            megatron_lm_plugin.seq_length = batch_data["input_ids"].shape[1]
        else:
            megatron_lm_plugin.seq_length = max_position_embeddings

    megatron_lm_plugin.megatron_lm_default_args["seq_length"] = megatron_lm_plugin.seq_length
    megatron_lm_plugin.megatron_lm_default_args["return_logits"] = megatron_lm_plugin.return_logits
    megatron_lm_plugin.megatron_lm_default_args["tokenizer_type"] = "GPT2BPETokenizer"
    megatron_lm_plugin.megatron_lm_default_args["model_type_name"] = model_type_name
    megatron_lm_plugin.megatron_lm_default_args["num_layers"] = num_layers
    megatron_lm_plugin.megatron_lm_default_args["hidden_size"] = hidden_size
    megatron_lm_plugin.megatron_lm_default_args["num_attention_heads"] = num_attention_heads
    megatron_lm_plugin.megatron_lm_default_args["max_position_embeddings"] = max_position_embeddings
    megatron_lm_plugin.megatron_lm_default_args["pretraining_flag"] = pretraining_flag
    megatron_lm_plugin.megatron_lm_default_args["orig_vocab_size"] = orig_vocab_size
    megatron_lm_plugin.megatron_lm_default_args["model_return_dict"] = config.return_dict

@add_model_config_to_megatron_parser("t5")
def parse_t5_config(megatron_lm_plugin, model, batch_data):
    """
    Fill `megatron_lm_plugin.megatron_lm_default_args` from a T5 model config.

    Encoder and decoder lengths that are still `None` on the plugin are resolved in place: from the batch when one is
    given (`input_ids` for the encoder, `labels` for the decoder), otherwise from `config.n_positions`, which
    defaults to 1024 when the config has no such attribute.

    Args:
        megatron_lm_plugin: Plugin whose sequence lengths and `megatron_lm_default_args` are updated.
        model: Model exposing a `config` with the T5 hyper-parameters read below.
        batch_data: Optional batch used to infer the sequence lengths.
    """
    config = model.config
    layer_count = config.num_layers
    model_width = config.d_model
    head_count = config.num_heads
    position_limit = getattr(config, "n_positions", 1024)
    vocab_size = config.vocab_size

    if megatron_lm_plugin.encoder_seq_length is None:
        megatron_lm_plugin.encoder_seq_length = (
            position_limit if batch_data is None else batch_data["input_ids"].shape[1]
        )
    if megatron_lm_plugin.decoder_seq_length is None:
        megatron_lm_plugin.decoder_seq_length = (
            position_limit if batch_data is None else batch_data["labels"].shape[1]
        )

    megatron_lm_plugin.megatron_lm_default_args.update(
        {
            "encoder_seq_length": megatron_lm_plugin.encoder_seq_length,
            "decoder_seq_length": megatron_lm_plugin.decoder_seq_length,
            "model_type_name": "t5",
            "num_layers": layer_count,
            "hidden_size": model_width,
            "num_attention_heads": head_count,
            "max_position_embeddings": position_limit,
            "pretraining_flag": True,
            "orig_vocab_size": vocab_size,
            "model_return_dict": config.return_dict,
        }
    )


@add_model_config_to_megatron_parser("llama")
def parse_llama_config(megatron_lm_plugin, model, batch_data):
    """
    Fill `megatron_lm_plugin.megatron_lm_default_args` from a Llama model config.

    When the plugin has no `seq_length`, it is resolved in place from the first available source:
    `config.max_sequence_length`, the plugin's `decoder_seq_length`, `batch_data["input_ids"].shape[1]`, and finally
    `config.max_position_embeddings`.

    Args:
        megatron_lm_plugin: Plugin whose `seq_length` and `megatron_lm_default_args` are updated.
        model: Model exposing a `config` with the Llama hyper-parameters read below.
        batch_data: Optional batch used to infer the sequence length.
    """
    config = model.config
    layer_count = config.num_hidden_layers
    model_width = config.hidden_size
    head_count = config.num_attention_heads
    vocab_size = config.vocab_size
    position_limit = config.max_position_embeddings
    config_seq_length = getattr(config, "max_sequence_length", None)

    if megatron_lm_plugin.seq_length is None:
        if config_seq_length is not None:
            resolved_length = config_seq_length
        elif megatron_lm_plugin.decoder_seq_length is not None:
            resolved_length = megatron_lm_plugin.decoder_seq_length
        elif batch_data is not None:
            resolved_length = batch_data["input_ids"].shape[1]
        else:
            resolved_length = position_limit
        megatron_lm_plugin.seq_length = resolved_length

    megatron_lm_plugin.megatron_lm_default_args.update(
        {
            "return_logits": megatron_lm_plugin.return_logits,
            "tokenizer_type": "Llama2Tokenizer",
            "model_type_name": "gpt",
            "num_layers": layer_count,
            "pretraining_flag": True,
            "hidden_size": model_width,
            "num_attention_heads": head_count,
            "orig_vocab_size": vocab_size,
            "max_position_embeddings": position_limit,
            "seq_length": megatron_lm_plugin.seq_length,
            "model_return_dict": config.return_dict,
        }
    )


@add_model_config_to_megatron_parser("glm4_moe")
def parse_glm4_moe_config(megatron_lm_plugin, model, batch_data):
    """
    Fill `megatron_lm_plugin.megatron_lm_default_args` from a GLM4 MoE model config and log the result.

    When the plugin has no `seq_length`, it is resolved in place from the first available source:
    `config.max_sequence_length`, the plugin's `decoder_seq_length`, `batch_data["input_ids"].shape[1]`, and finally
    `config.max_position_embeddings`. The fp8 options are copied only when `config.fp8_param` is present and truthy.

    Args:
        megatron_lm_plugin: Plugin whose `seq_length` and `megatron_lm_default_args` are updated.
        model: Model exposing a `config` with the GLM4 MoE hyper-parameters read below.
        batch_data: Optional batch used to infer the sequence length.
    """
    config = model.config
    args = megatron_lm_plugin.megatron_lm_default_args

    layer_count = config.num_hidden_layers
    model_width = config.hidden_size
    head_count = config.num_attention_heads
    vocab_size = config.vocab_size
    position_limit = config.max_position_embeddings
    config_seq_length = getattr(config, "max_sequence_length", None)

    if megatron_lm_plugin.seq_length is None:
        if config_seq_length is not None:
            megatron_lm_plugin.seq_length = config_seq_length
        elif megatron_lm_plugin.decoder_seq_length is not None:
            megatron_lm_plugin.seq_length = megatron_lm_plugin.decoder_seq_length
        elif batch_data is not None:
            megatron_lm_plugin.seq_length = batch_data["input_ids"].shape[1]
        else:
            megatron_lm_plugin.seq_length = position_limit

    # --- general model shape / tokenizer ---
    args["return_logits"] = megatron_lm_plugin.return_logits
    args["tokenizer_type"] = "HuggingFaceTokenizer"
    args["model_type_name"] = "gpt"
    args["num_layers"] = layer_count
    args["pretraining_flag"] = False
    args["hidden_size"] = model_width
    args["num_attention_heads"] = head_count
    args["kv_channels"] = config.head_dim
    args["orig_vocab_size"] = vocab_size
    args["max_position_embeddings"] = position_limit
    args["seq_length"] = megatron_lm_plugin.seq_length
    args["model_return_dict"] = config.return_dict
    args["position_embedding_type"] = "rope"
    args["original_model_type"] = config.model_type
    # this is true for glm4.5 but False for glm4.5-air.
    args["qk_layernorm"] = config.use_qk_norm
    args["add_bias_linear"] = False
    args["group_query_attention"] = True
    args["num_query_groups"] = config.num_key_value_heads
    args["ffn_hidden_size"] = config.intermediate_size
    args["add_qkv_bias"] = True
    args["normalization"] = "RMSNorm"
    # NOTE(review): this hyphenated key is distinct from `rotary_percent`, which is set further down — confirm
    # whether it is intentional.
    args["rotary-percent"] = 0.5
    args["swiglu"] = True

    # --- mixture-of-experts settings ---
    expert_width = config.moe_intermediate_size
    args["moe_ffn_hidden_size"] = expert_width
    args["moe_shared_expert_intermediate_size"] = config.moe_intermediate_size
    args["moe_router_pre_softmax"] = True
    args["moe_router_score_function"] = "sigmoid"
    args["moe_router_enable_expert_bias"] = True
    args["moe_router_bias_update_rate"] = 0
    args["moe_router_load_balancing_type"] = "seq_aux_loss"
    args["moe_token_dispatcher_type"] = "alltoall"
    args["moe_router_topk"] = config.num_experts_per_tok
    args["moe_router_topk_scaling_factor"] = config.routed_scaling_factor
    # One entry per layer: 0 for the first `first_k_dense_replace` layers, 1 for the remaining ones.
    dense_prefix = [0] * config.first_k_dense_replace
    moe_suffix = [1] * (config.num_hidden_layers - config.first_k_dense_replace)
    args["moe_layer_freq"] = dense_prefix + moe_suffix
    args["num_experts"] = config.n_routed_experts
    args["moe_grouped_gemm"] = True
    args["moe_router_dtype"] = "fp32"
    args["moe_permute_fusion"] = True
    args["moe_aux_loss_coeff"] = 0

    # --- rotary embedding / misc ---
    args["rotary_base"] = config.rope_theta
    args["rope_type"] = "rope"
    args["rotary_percent"] = config.partial_rotary_factor
    args["norm_epsilon"] = 1e-3
    args["use_flash_attn"] = True
    args["eos_token_id"] = config.eos_token_id

    # --- precision ---
    if getattr(config, "fp8_param", False):
        args["fp8"] = config.fp8
        args["fp8_param"] = config.fp8_param
        args["fp8_param_gather"] = config.fp8_param_gather
        args["fp8_recipe"] = config.fp8_recipe
    args["bf16"] = config.bf16
    args["untie_embeddings_and_output_weights"] = not config.tie_word_embeddings
    logger.info(f"Parsed GLM4 MoE config: {megatron_lm_plugin.megatron_lm_default_args}")


@dataclass
class BnbQuantizationConfig:
    """
    A plugin to enable BitsAndBytes 4bit and 8bit quantization

    Exactly one of `load_in_8bit` / `load_in_4bit` must be `True`. After initialization the instance also carries a
    `target_dtype` attribute (`torch.int8` for 8bit, `CustomDtype.INT4` for 4bit), and the dtype fields hold
    `torch.dtype` objects even when they were given as strings.

    Args:
        load_in_8bit (`bool`, defaults to `False`):
            Enable 8bit quantization.
        llm_int8_threshold (`float`, defaults to `6.0`):
            Value of the outlier threshold. Only relevant when `load_in_8bit=True`.
        load_in_4bit (`bool`, defaults to `False`):
            Enable 4bit quantization.
        bnb_4bit_quant_type (`str`, defaults to `fp4`):
            Set the quantization data type in the `bnb.nn.Linear4Bit` layers. Options are {'fp4','nf4'}.
        bnb_4bit_use_double_quant (`bool`, defaults to `False`):
            Enable nested quantization where the quantization constants from the first quantization are quantized
            again.
        bnb_4bit_compute_dtype (`str` or `torch.dtype`, defaults to `fp16`):
            This sets the computational type which might be different than the input type. For example, inputs might be
            fp32, but computation can be set to bf16 for speedups. Options are {'fp32','fp16','bf16'}.
        torch_dtype (`torch.dtype` or `str`, defaults to `None`):
            This sets the dtype of the remaining non quantized layers. `bitsandbytes` library suggests to set the value
            to `torch.float16` for 8 bit model and use the same dtype as the compute dtype for 4 bit model. Strings
            in {'fp32','fp16','bf16'} are accepted as well.
        skip_modules (`List[str]`, defaults to `None`):
            An explicit list of the modules that we don't quantize. The dtype of these modules will be `torch_dtype`.
        keep_in_fp32_modules (`List`, defaults to `None`):
            An explicit list of the modules that we don't quantize. We keep them in `torch.float32`.
    """

    load_in_8bit: bool = field(default=False, metadata={"help": "enable 8bit quantization."})

    llm_int8_threshold: float = field(
        default=6.0,
        metadata={"help": "value of the outlier threshold. only relevant when load_in_8bit=True"},
    )

    load_in_4bit: bool = field(default=False, metadata={"help": "enable 4bit quantization."})

    bnb_4bit_quant_type: str = field(
        default="fp4",
        metadata={
            "help": "set the quantization data type in the `bnb.nn.Linear4Bit` layers. Options are {'fp4','nf4'}."
        },
    )

    bnb_4bit_use_double_quant: bool = field(
        default=False,
        metadata={
            "help": "enable nested quantization where the quantization constants from the first quantization are quantized again."
        },
    )

    bnb_4bit_compute_dtype: str = field(
        default="fp16",
        metadata={
            "help": "This sets the computational type which might be different than the input type. For example, inputs might be "
            "fp32, but computation can be set to bf16 for speedups. Options are {'fp32','fp16','bf16'}."
        },
    )

    torch_dtype: torch.dtype = field(
        default=None,
        metadata={
            "help": "this sets the dtype of the remaining non quantized layers. `bitsandbytes` library suggests to set the value "
            "to `torch.float16` for 8 bit model and use the same dtype as the compute dtype for 4 bit model "
        },
    )

    skip_modules: list[str] = field(
        default=None,
        metadata={
            "help": "an explicit list of the modules that we don't quantize. The dtype of these modules will be `torch_dtype`."
        },
    )

    keep_in_fp32_modules: list[str] = field(
        default=None,
        metadata={"help": "an explicit list of the modules that we don't quantize. We keep them in `torch.float32`."},
    )

    def __post_init__(self):
        """
        Safety checker that arguments are correct - also replaces some NoneType arguments with their default values.

        Raises:
            ValueError: If any argument has the wrong type or an unsupported value, or if `load_in_4bit` and
                `load_in_8bit` are both `True` or both `False`.
        """
        # Short string aliases accepted for both `bnb_4bit_compute_dtype` and `torch_dtype`.
        dtype_aliases = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}

        if not isinstance(self.load_in_8bit, bool):
            raise ValueError("load_in_8bit must be a boolean")

        if not isinstance(self.load_in_4bit, bool):
            raise ValueError("load_in_4bit must be a boolean")

        if self.load_in_4bit and self.load_in_8bit:
            raise ValueError("load_in_4bit and load_in_8bit can't be both True")

        if not self.load_in_4bit and not self.load_in_8bit:
            raise ValueError("load_in_4bit and load_in_8bit can't be both False")

        if not isinstance(self.llm_int8_threshold, (int, float)):
            raise ValueError("llm_int8_threshold must be a float or an int")

        if not isinstance(self.bnb_4bit_quant_type, str):
            raise ValueError("bnb_4bit_quant_type must be a string")
        elif self.bnb_4bit_quant_type not in ["fp4", "nf4"]:
            raise ValueError(f"bnb_4bit_quant_type must be in ['fp4','nf4'] but found {self.bnb_4bit_quant_type}")

        if not isinstance(self.bnb_4bit_use_double_quant, bool):
            raise ValueError("bnb_4bit_use_double_quant must be a boolean")

        if isinstance(self.bnb_4bit_compute_dtype, str):
            if self.bnb_4bit_compute_dtype not in dtype_aliases:
                raise ValueError(
                    f"bnb_4bit_compute_dtype must be in ['fp32','fp16','bf16'] but found {self.bnb_4bit_compute_dtype}"
                )
            self.bnb_4bit_compute_dtype = dtype_aliases[self.bnb_4bit_compute_dtype]
        elif not isinstance(self.bnb_4bit_compute_dtype, torch.dtype):
            raise ValueError("bnb_4bit_compute_dtype must be a string or a torch.dtype")

        if self.skip_modules is not None and not isinstance(self.skip_modules, list):
            raise ValueError("skip_modules must be a list of strings")

        if self.keep_in_fp32_modules is not None and not isinstance(self.keep_in_fp32_modules, list):
            raise ValueError("keep_in_fp32_modules must be a list of strings")

        # Exactly one of the two flags is True at this point (checked above).
        if self.load_in_4bit:
            self.target_dtype = CustomDtype.INT4

        if self.load_in_8bit:
            self.target_dtype = torch.int8

        if self.load_in_4bit and self.llm_int8_threshold != 6.0:
            warnings.warn("llm_int8_threshold can only be used for model loaded in 8bit")

        if isinstance(self.torch_dtype, str):
            if self.torch_dtype not in dtype_aliases:
                raise ValueError(f"torch_dtype must be in ['fp32','fp16','bf16'] but found {self.torch_dtype}")
            self.torch_dtype = dtype_aliases[self.torch_dtype]

        # Defaults for the non-quantized layers: fp16 for 8bit, the compute dtype for 4bit.
        if self.load_in_8bit and self.torch_dtype is None:
            self.torch_dtype = torch.float16

        if self.load_in_4bit and self.torch_dtype is None:
            self.torch_dtype = self.bnb_4bit_compute_dtype

        if not isinstance(self.torch_dtype, torch.dtype):
            raise ValueError("torch_dtype must be a torch.dtype")


def get_module_class_from_name(module, name):
    """
    Search `module` and its descendants, depth first, for a class called `name`.

    Args:
        module (`torch.nn.Module`): Root of the module tree to search.
        name (`str`): The `__name__` of the class to look for.

    Returns:
        The class of the first module whose class name equals `name`, or `None` when nothing matches.
    """
    sub_modules = list(module.children())
    own_class = module.__class__
    if own_class.__name__ == name:
        return own_class

    # Lazily recurse into the children and stop at the first hit; yields `None` for a leaf or when nothing matches.
    candidates = (get_module_class_from_name(sub_module, name) for sub_module in sub_modules)
    return next((found for found in candidates if found is not None), None)


================================================
FILE: src/accelerate/utils/deepspeed.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import base64
import json
import os
from copy import deepcopy

from torch import optim

from ..optimizer import AcceleratedOptimizer
from ..scheduler import AcceleratedScheduler
from .dataclasses import DistributedType
from .imports import is_bnb_available
from .versions import compare_versions


def map_pytorch_optim_to_deepspeed(optimizer):
    """
    Build the DeepSpeed CPU optimizer (`deepspeed.ops`) that corresponds to a PyTorch / bitsandbytes optimizer.

    Only the `lr` and `weight_decay` defaults of the original optimizer are carried over. `DeepSpeedCPUAdam` is the
    fallback class; `DeepSpeedCPUAdagrad` and `DeepSpeedCPULion` are selected for (32-bit) Adagrad and Lion optimizers
    when the installed DeepSpeed / bitsandbytes versions allow it.

    Args:
        optimizer: torch.optim.Optimizer

    Returns the DeepSpeedCPUOptimizer (deepspeed.ops) version of the optimizer.
    """

    defaults = {k: v for k, v in optimizer.defaults.items() if k in ["lr", "weight_decay"]}

    # Select the DeepSpeedCPUOptimizer based on the original optimizer class.
    # DeepSpeedCPUAdam is the default
    from deepspeed.ops.adam import DeepSpeedCPUAdam

    optimizer_class = DeepSpeedCPUAdam

    # For DeepSpeedCPUAdam (adamw_mode)
    if compare_versions("deepspeed", ">=", "0.3.1"):
        is_adamw = isinstance(optimizer, optim.AdamW)

        if is_bnb_available() and not is_adamw:
            import bitsandbytes.optim as bnb_opt

            # A bitsandbytes AdamW only counts when it keeps 32-bit optimizer state.
            if isinstance(optimizer, (bnb_opt.AdamW, bnb_opt.AdamW32bit)):
                try:
                    is_adamw = optimizer.optim_bits == 32
                except AttributeError:
                    is_adamw = optimizer.args.optim_bits == 32

        defaults["adamw_mode"] = is_adamw

    # For DeepSpeedCPUAdagrad
    if compare_versions("deepspeed", ">=", "0.5.5"):
        # Check if the optimizer is PyTorch's Adagrad.
        is_ada = isinstance(optimizer, optim.Adagrad)
        # If not, and bitsandbytes is available,
        # check if the optimizer is the 32-bit bitsandbytes Adagrad.
        if is_bnb_available() and not is_ada:
            import bitsandbytes.optim as bnb_opt

            if isinstance(optimizer, (bnb_opt.Adagrad, bnb_opt.Adagrad32bit)):
                try:
                    is_ada = optimizer.optim_bits == 32
                except AttributeError:
                    is_ada = optimizer.args.optim_bits == 32
        if is_ada:
            from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad

            optimizer_class = DeepSpeedCPUAdagrad

    # For DeepSpeedCPULion
    if is_bnb_available(min_version="0.38.0") and compare_versions("deepspeed", ">=", "0.11.0"):
        from bitsandbytes.optim import Lion, Lion32bit

        if isinstance(optimizer, (Lion, Lion32bit)):
            try:
                is_bnb_32bits = optimizer.optim_bits == 32
            except AttributeError:
                is_bnb_32bits = optimizer.args.optim_bits == 32
            if is_bnb_32bits:
                from deepspeed.ops.lion import DeepSpeedCPULion

                optimizer_class = DeepSpeedCPULion

    # `adamw_mode` is an Adam-specific switch. Do not forward it to the Adagrad / Lion constructors, whose documented
    # signatures do not list it (it would be rejected as an unexpected keyword argument).
    if optimizer_class is not DeepSpeedCPUAdam:
        defaults.pop("adamw_mode", None)

    return optimizer_class(optimizer.param_groups, **defaults)


def get_active_deepspeed_plugin(state):
    """
    Return the `DeepSpeedPlugin` that is currently in use for `state`.

    When `state.deepspeed_plugins` is a dict, the first plugin whose `selected` attribute is truthy is returned;
    otherwise `state.deepspeed_plugins` itself is returned.

    Raises:
        ValueError: If `state.distributed_type` is not `DistributedType.DEEPSPEED`.
    """
    if state.distributed_type != DistributedType.DEEPSPEED:
        raise ValueError(
            "Couldn't retrieve the active `DeepSpeedPlugin` as none were enabled. "
            "Please make sure that either `Accelerator` is configured for `deepspeed` "
            "or make sure that the desired `DeepSpeedPlugin` has been enabled (`AcceleratorState().select_deepspeed_plugin(name)`) "
            "before calling this function."
        )

    plugins = state.deepspeed_plugins
    if isinstance(plugins, dict):
        # Several named plugins: pick the one flagged as selected.
        return next(filter(lambda candidate: candidate.selected, plugins.values()))
    return plugins


class HfDeepSpeedConfig:
    """
    This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.

    A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
    things like the Trainer object is not available (e.g. `from_pretrained` and `_get_resized_embeddings`). Therefore
    it's important that this object remains alive while the program is still running.

    [`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to sync the configuration
    with values of [`TrainingArguments`] by replacing special placeholder values: `"auto"`. Without this special logic
    the DeepSpeed configuration is not modified in any way.

    Args:
        config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict. A JSON string or a
            urlsafe-base64 encoded JSON string is accepted as well.

    Raises:
        ValueError: If `config_file_or_dict` is neither a dict, an existing path, nor a decodable JSON / base64 string.
    """

    def __init__(self, config_file_or_dict):
        if isinstance(config_file_or_dict, dict):
            # Don't modify user's data should they want to reuse it (e.g. in tests), because once we
            # modified it, it will not be accepted here again, since `auto` values would have been overridden
            config = deepcopy(config_file_or_dict)
        elif os.path.exists(config_file_or_dict):
            with open(config_file_or_dict, encoding="utf-8") as f:
                config = json.load(f)
        else:
            try:
                try:
                    # First try parsing as JSON directly
                    config = json.loads(config_file_or_dict)
                except json.JSONDecodeError:
                    # If that fails, try base64 decoding
                    config_decoded = base64.urlsafe_b64decode(config_file_or_dict).decode("utf-8")
                    config = json.loads(config_decoded)
            except (UnicodeDecodeError, AttributeError, TypeError, ValueError) as err:
                # `TypeError` covers non-string inputs (e.g. a `pathlib.Path` to a missing file) that `json.loads`
                # rejects, so they get the same explanatory error as any other unusable value.
                raise ValueError(
                    f"Expected a string path to an existing deepspeed config, or a dictionary, or a base64 encoded string. Received: {config_file_or_dict}"
                ) from err

        self.config = config

        self.set_stage_and_offload()

    def set_stage_and_offload(self):
        """Cache the ZeRO stage (`-1` when unset) and whether optimizer/param offload to cpu or nvme is configured."""
        # zero stage - this is done as early as possible, before model is created, to allow
        # ``is_deepspeed_zero3_enabled`` query and getting to the early deepspeed config object
        # during ``zero.Init()`` which needs to know the dtype, and some other hparams.
        self._stage = self.get_value("zero_optimization.stage", -1)

        # offload is only looked at for ZeRO stage 2 and 3
        self._offload = False
        if self.is_zero2() or self.is_zero3():
            offload_devices_valid = {"cpu", "nvme"}
            offload_devices = {
                self.get_value("zero_optimization.offload_optimizer.device"),
                self.get_value("zero_optimization.offload_param.device"),
            }
            if offload_devices & offload_devices_valid:
                self._offload = True

    def find_config_node(self, ds_key_long):
        """
        Walk the dotted key `ds_key_long` down to its parent node.

        Returns:
            A `(node, last_key)` tuple where `node` is the dict that should contain `last_key`, or `None` if one of
            the intermediate keys is missing.
        """
        config = self.config

        # find the config node of interest if it exists
        nodes = ds_key_long.split(".")
        ds_key = nodes.pop()
        for node in nodes:
            config = config.get(node)
            if config is None:
                return None, ds_key

        return config, ds_key

    def get_value(self, ds_key_long, default=None):
        """
        Returns the set value or `default` if no value is set
        """
        config, ds_key = self.find_config_node(ds_key_long)
        if config is None:
            return default
        return config.get(ds_key, default)

    def del_config_sub_tree(self, ds_key_long, must_exist=False):
        """
        Deletes a sub-section of the config file if it's found.

        Unless `must_exist` is `True` the section doesn't have to exist.

        Raises:
            ValueError: If `must_exist` is `True` and the dotted key cannot be resolved.
        """
        config = self.config

        # find the config node of interest if it exists
        nodes = ds_key_long.split(".")
        for node in nodes:
            parent_config = config
            config = config.get(node)
            if config is None:
                if must_exist:
                    raise ValueError(f"Can't find {ds_key_long} entry in the config: {self.config}")
                else:
                    return

        # if found remove it (after the loop `node` is the last key and `parent_config` the dict holding it)
        if parent_config is not None:
            parent_config.pop(node)

    def is_true(self, ds_key_long):
        """
        Returns `True`/`False` only if the value is set, always `False` otherwise. So use this method to ask the very
        specific question of whether the value is set to `True` (and it's not set to `False` or isn't set).

        """
        value = self.get_value(ds_key_long)
        return False if value is None else bool(value)

    def is_false(self, ds_key_long):
        """
        Returns `True`/`False` only if the value is set, always `False` otherwise. So use this method to ask the very
        specific question of whether the value is set to `False` (and it's not set to `True` or isn't set).
        """
        value = self.get_value(ds_key_long)
        return False if value is None else not bool(value)

    def is_zero2(self):
        """Whether the configured ZeRO stage is 2."""
        return self._stage == 2

    def is_zero3(self):
        """Whether the configured ZeRO stage is 3."""
        return self._stage == 3

    def is_offload(self):
        """Whether cpu/nvme offload was detected by `set_stage_and_offload`."""
        return self._offload


class DeepSpeedEngineWrapper:
    """
    Internal wrapper for deepspeed.runtime.engine.DeepSpeedEngine, letting a conventional training loop drive the
    engine through a single `backward` call.

    Args:
        engine (deepspeed.runtime.engine.DeepSpeedEngine): deepspeed engine to wrap
    """

    def __init__(self, engine):
        self.engine = engine

    def backward(self, loss, sync_gradients=True, **kwargs):
        """
        Runs the engine's backward pass and, when `sync_gradients` is `True`, the engine's `step()` as well.

        Args:
            loss: The loss handed to `engine.backward`.
            sync_gradients (bool): Forwarded to the engine as the gradient accumulation boundary flag; the engine
                step only runs when it is `True`.
            **kwargs: Extra keyword arguments forwarded to `engine.backward`.
        """
        ds_engine = self.engine

        # Mirror Accelerate's sync_gradients state so DeepSpeed knows whether this is the last micro-batch
        # before gradients are synchronized
        ds_engine.set_gradient_accumulation_boundary(is_boundary=sync_gradients)

        # backpropagation, including mixed precision handling
        ds_engine.backward(loss, **kwargs)

        if not sync_gradients:
            # still accumulating: no step yet
            return

        # Deepspeed's `engine.step` performs the following operations:
        # - gradient accumulation check
        # - gradient clipping
        # - optimizer step
        # - zero grad
        # - checking overflow
        # - lr_scheduler step (only if engine.lr_scheduler is not None)
        # The DeepSpeed optimizer/scheduler wrappers turn their own `step`/`zero_grad` into no-ops, so the same
        # simple training loop works with and without DeepSpeed.
        ds_engine.step()

    def get_global_grad_norm(self):
        """Get the global gradient norm from DeepSpeed engine, unwrapped via `.item()` when available."""
        norm = self.engine.get_global_grad_norm()
        return norm.item() if hasattr(norm, "item") else norm


class DeepSpeedOptimizerWrapper(AcceleratedOptimizer):
    """
    Internal wrapper around a deepspeed optimizer whose `step` and `zero_grad` do nothing.

    Args:
        optimizer (`torch.optim.optimizer.Optimizer`):
            The optimizer to wrap.
    """

    def __init__(self, optimizer):
        super().__init__(optimizer, device_placement=False, scaler=None)
        # remembered once so `step_was_skipped` doesn't have to probe the optimizer on every access
        self.__has_overflow__ = hasattr(self.optimizer, "overflow")

    def zero_grad(self, set_to_none=None):
        """No-op: `accelerator.backward(loss)` is doing that automatically, so no implementation is needed."""

    def step(self):
        """No-op: `accelerator.backward(loss)` is doing that automatically, so no implementation is needed."""

    @property
    def step_was_skipped(self):
        """Whether or not the optimizer step was done, or skipped because of gradient overflow."""
        return self.optimizer.overflow if self.__has_overflow__ else False


class DeepSpeedSchedulerWrapper(AcceleratedScheduler):
    """
    Internal wrapper around a deepspeed scheduler whose `step` does nothing.

    Args:
        scheduler (`torch.optim.lr_scheduler.LambdaLR`):
            The scheduler to wrap.
        optimizers (one or a list of `torch.optim.Optimizer`):
            The optimizer(s) passed through to the parent class together with the scheduler.
    """

    def __init__(self, scheduler, optimizers):
        super().__init__(scheduler, optimizers)

    def step(self):
        """No-op: `accelerator.backward(loss)` is doing that automatically, so no implementation is needed."""


class DummyOptim:
    """
    Dummy optimizer presents model parameters or param groups, this is primarily used to follow conventional training
    loop when optimizer config is specified in the deepspeed config file. It only stores what it is given.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float):
            Learning rate.
        weight_decay (float):
            Weight decay.
        **kwargs (additional keyword arguments, *optional*):
            Other arguments.
    """

    def __init__(self, params, lr=0.001, weight_decay=0, **kwargs):
        self.params, self.lr, self.weight_decay = params, lr, weight_decay
        self.kwargs = kwargs


class DummyScheduler:
    """
    Dummy scheduler that only stores the optimizer and scheduling arguments it is given; this is primarily used to
    follow conventional training loop when scheduler config is specified in the deepspeed config file.

    Args:
        optimizer (`torch.optim.optimizer.Optimizer`):
            The optimizer to wrap.
        total_num_steps (int, *optional*):
            Total number of steps.
        warmup_num_steps (int, *optional*):
            Number of steps for warmup.
        lr_scheduler_callable (callable, *optional*):
            A callable function that creates an LR Scheduler. It accepts only one argument `optimizer`.
        **kwargs (additional keyword arguments, *optional*):
            Other arguments.
    """

    def __init__(self, optimizer, total_num_steps=None, warmup_num_steps=0, lr_scheduler_callable=None, **kwargs):
        self.optimizer = optimizer
        self.total_num_steps, self.warmup_num_steps = total_num_steps, warmup_num_steps
        self.lr_scheduler_callable = lr_scheduler_callable
        self.kwargs = kwargs


================================================
FILE: src/accelerate/utils/environment.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import math
import os
import platform
import subprocess
import sys
from contextlib import contextmanager
from dataclasses import dataclass, field
from functools import lru_cache, wraps
from shutil import which
from typing import Optional, Union

import torch
from packaging.version import parse


logger = logging.getLogger(__name__)


def convert_dict_to_env_variables(current_env: dict):
    """
    Filters `current_env` down to the entries whose key and value are both non-empty and free of forbidden characters
    (`;`, newline, `<`, `>`, space), and returns them as a list of `KEY=VALUE` lines. Rejected entries are logged
    with a warning and skipped.

    Example:
    ```python
    >>> from accelerate.utils.environment import convert_dict_to_env_variables

    >>> env = {"ACCELERATE_DEBUG_MODE": "1", "BAD_ENV_NAME": "<mything", "OTHER_ENV": "2"}
    >>> valid_env_items = convert_dict_to_env_variables(env)
    >>> print(valid_env_items)
    ["ACCELERATE_DEBUG_MODE=1\\n", "OTHER_ENV=2\\n"]
    ```
    """
    forbidden_chars = (";", "\n", "<", ">", " ")
    env_lines = []
    for name, val in current_env.items():
        combined = name + val
        has_forbidden = any(ch in combined for ch in forbidden_chars)
        if has_forbidden or len(name) < 1 or len(val) < 1:
            logger.warning(f"WARNING: Skipping {name}={val} as it contains forbidden characters or missing values.")
            continue
        env_lines.append(f"{name}={val}\n")
    return env_lines


def str_to_bool(value, to_bool: bool = False) -> Union[int, bool]:
    """
    Converts a string representation of truth to `1`/`0`, or to `True`/`False` when `to_bool` is set.

    The comparison is case-insensitive. True values are `y`, `yes`, `t`, `true`, `on`, and `1`; False values are `n`,
    `no`, `f`, `false`, `off`, and `0`. Anything else raises a `ValueError`.
    """
    normalized = value.lower()
    if normalized in {"y", "yes", "t", "true", "on", "1"}:
        truthy = True
    elif normalized in {"n", "no", "f", "false", "off", "0"}:
        truthy = False
    else:
        raise ValueError(f"invalid truth value {normalized}")
    return truthy if to_bool else int(truthy)


def get_int_from_env(env_keys, default):
    """Returns the first non-negative integer env value found in the `env_keys` list or the default."""
    for name in env_keys:
        # unset variables are read as -1 and therefore skipped like any other negative value
        candidate = int(os.environ.get(name, -1))
        if candidate < 0:
            continue
        return candidate
    return default


def parse_flag_from_env(key, default=False):
    """Returns the boolean reading of env variable `key`, falling back to `str(default)` when it is not set."""
    raw = os.environ.get(key, str(default))
    # `str_to_bool` hands back an int (1/0) by default, hence the comparison
    flag = str_to_bool(raw)
    return flag == 1


def parse_choice_from_env(key, default="no"):
    """Returns the raw string value of env variable `key`, or `str(default)` when it is not set."""
    return os.environ.get(key, str(default))


def are_libraries_initialized(*library_names: str) -> list[str]:
    """
    Returns the subset of `library_names` (in the order given) that are currently present in `sys.modules`, i.e.
    have already been imported in this interpreter.
    """
    loaded = sys.modules
    return [name for name in library_names if name in loaded]


def get_current_device_type() -> tuple[str, str]:
    """
    Determines the current device type and distributed type without initializing any device.

    This is particularly important when using fork-based multiprocessing, as device initialization
    before forking can cause errors.

    Backends are probed in this order, and the first available one wins:
    MLU -> SDAA -> MUSA -> NPU -> HPU -> CUDA -> XPU -> Neuron
    (intended to follow the priority used by state.py:_prepare_backend()).

    Returns:
        tuple[str, str]: A tuple of (device_type, distributed_type)
            - device_type: The device string (e.g., "cuda", "npu", "xpu")
            - distributed_type: The distributed type string (e.g., "MULTI_GPU", "MULTI_NPU")

    Example:
        ```python
        >>> device_type, distributed_type = get_current_device_type()
        >>> print(device_type)  # "cuda"
        >>> print(distributed_type)  # "MULTI_GPU"
        ```
    """
    from .imports import (
        is_hpu_available,
        is_mlu_available,
        is_musa_available,
        is_neuron_available,
        is_npu_available,
        is_sdaa_available,
        is_xpu_available,
    )

    # (availability probe, result) pairs in priority order; probes are only called until one succeeds
    probes = (
        (is_mlu_available, ("mlu", "MULTI_MLU")),
        (is_sdaa_available, ("sdaa", "MULTI_SDAA")),
        (is_musa_available, ("musa", "MULTI_MUSA")),
        (is_npu_available, ("npu", "MULTI_NPU")),
        (is_hpu_available, ("hpu", "MULTI_HPU")),
        (torch.cuda.is_available, ("cuda", "MULTI_GPU")),
        (is_xpu_available, ("xpu", "MULTI_XPU")),
        (is_neuron_available, ("neuron", "MULTI_NEURON")),
    )
    for is_available, result in probes:
        if is_available():
            return result

    # Default to CUDA even if not available (for CPU-only scenarios where CUDA code paths are still used)
    return "cuda", "MULTI_GPU"


def _nvidia_smi():
    """
    Returns the right nvidia-smi command based on the system.

    Everywhere except Windows this is simply `"nvidia-smi"`. On Windows the executable found on the PATH is used,
    falling back to the default installation path on the system drive when it isn't on the PATH.
    """
    if platform.system() != "Windows":
        return "nvidia-smi"

    located = which("nvidia-smi")
    if located is not None:
        return located
    return f"{os.environ['systemdrive']}\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe"


def get_gpu_info():
    """
    Gets GPU count and names using `nvidia-smi` instead of torch to not initialize CUDA.

    Largely based on the `gputil` library.

    Returns:
        tuple[list[str], int]: The GPU names (one per line reported by `nvidia-smi`) and the number of GPUs.

    Raises:
        Whatever `subprocess.check_output` raises when `nvidia-smi` is missing or exits with an error.
    """
    # Returns as list of `n` GPUs and their names
    output = subprocess.check_output(
        [_nvidia_smi(), "--query-gpu=count,name", "--format=csv,noheader"], universal_newlines=True
    )
    # The output is read in text mode, so line endings are already normalized to "\n"; splitting on
    # `os.linesep` ("\r\n" on Windows) would collapse every GPU into a single entry there.
    gpus = output.strip().splitlines()
    gpu_count = len(gpus)
    # Each line looks like "<count>, <name>"; split only once so the name is kept whole
    gpu_names = [gpu.split(",", 1)[1].strip() for gpu in gpus]
    return gpu_names, gpu_count


def get_driver_version():
    """
    Returns the driver version as reported by `nvidia-smi`.

    In the case of multiple GPUs, will return the first. An empty string is returned when `nvidia-smi` prints
    nothing.
    """
    output = subprocess.check_output(
        [_nvidia_smi(), "--query-gpu=driver_version", "--format=csv,noheader"], universal_newlines=True
    )
    # Text-mode output always uses "\n", so split with `splitlines()` rather than `os.linesep`; otherwise the
    # whole multi-line output would be returned on Windows.
    lines = output.strip().splitlines()
    return lines[0] if lines else ""


def check_cuda_p2p_ib_support():
    """
    Checks if the devices being used have issues with P2P and IB communications, namely any consumer GPU hardware after
    the 3090.

    Notably uses `nvidia-smi` instead of torch to not initialize CUDA. Returns `False` only when several GPUs are
    present, at least one is a known-problematic model and the driver is older than the accepted version; any
    failure while probing is swallowed and reported as supported (`True`).
    """
    try:
        names, count = get_gpu_info()
        # As new consumer GPUs get released, add them to `unsupported_devices`
        unsupported_devices = {"RTX 40"}
        if count > 1:
            affected = any(marker in name for name in names for marker in unsupported_devices)
            if affected:
                # Affected hardware is fine as long as the driver is recent enough
                minimum_driver_version = "550.40.07"
                installed_driver_version = get_driver_version()
                return not (parse(installed_driver_version) < parse(minimum_driver_version))
    except Exception:
        # best effort: when the probe itself fails, assume support
        pass
    return True


@lru_cache
def check_cuda_fp8_capability():
    """
    Checks if the current GPU available supports FP8, i.e. has a compute capability of at least 8.9.

    The compute capability of the first GPU listed by `nvidia-smi` is used. When that query fails for any reason,
    `torch.cuda.get_device_capability()` is used instead, which might initialize `torch.cuda`. The result is cached.
    """

    try:
        # try to get the compute capability from nvidia-smi
        output = subprocess.check_output(
            [_nvidia_smi(), "--query-gpu=compute_capability", "--format=csv,noheader"], universal_newlines=True
        )
        # Text-mode output always uses "\n": split with `splitlines()` instead of `os.linesep`, which on Windows
        # would leave several GPUs on one "line", break the int parsing and needlessly fall back to torch.
        # we take the first GPU's compute capability
        first_line = output.strip().splitlines()[0]
        compute_capability = tuple(map(int, first_line.split(".")))
    except Exception:
        compute_capability = torch.cuda.get_device_capability()

    return compute_capability >= (8, 9)


@dataclass
class CPUInformation:
    """
    Stores information about the CPU in a distributed environment. It contains the following attributes:
    - rank: The rank of the current process.
    - world_size: The total number of processes in the world.
    - local_rank: The rank of the current process on the local node.
    - local_world_size: The total number of processes on the local node.

    The defaults (rank 0 of 1 process, local rank 0 of 1 local process) describe a single-process run.
    """

    # Each field carries its description in the `help` metadata entry.
    rank: int = field(default=0, metadata={"help": "The rank of the current process."})
    world_size: int = field(default=1, metadata={"help": "The total number of processes in the world."})
    local_rank: int = field(default=0, metadata={"help": "The rank of the current process on the local node."})
    local_world_size: int = field(default=1, metadata={"help": "The total number of processes on the local node."})


def get_cpu_distributed_information() -> CPUInformation:
    """
    Builds a `CPUInformation` dataclass describing the CPU distributed training environment.

    Every field is read from the first of several launcher-specific environment variables that holds a usable
    value, and falls back to the single-process default (rank 0, size 1) otherwise.
    """
    rank = get_int_from_env(["RANK", "PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK"], 0)
    world_size = get_int_from_env(["WORLD_SIZE", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE"], 1)
    local_rank = get_int_from_env(
        ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0
    )
    local_world_size = get_int_from_env(
        ["LOCAL_WORLD_SIZE", "MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"],
        1,
    )
    return CPUInformation(
        rank=rank,
        world_size=world_size,
        local_rank=local_rank,
        local_world_size=local_world_size,
    )


def override_numa_affinity(local_process_index: int, verbose: Optional[bool] = None) -> None:
    """
    Overrides whatever NUMA affinity is set for the current process. This is very taxing and requires recalculating the
    affinity to set, ideally you should use `utils.environment.set_numa_affinity` instead.

    Nothing happens when `torch.cuda.is_available()` is `False`.

    Args:
        local_process_index (int):
            The index of the current process on the current server. It is also used as the NVML device index whose
            CPU affinity is queried.
        verbose (bool, *optional*):
            Whether to log out the assignment of each CPU. If `ACCELERATE_DEBUG_MODE` is enabled, will default to True.

    Raises:
        ImportError: If CUDA is available but `pynvml` (the `nvidia-ml-py` package) is not installed.
    """
    if verbose is None:
        verbose = parse_flag_from_env("ACCELERATE_DEBUG_MODE", False)
    if torch.cuda.is_available():
        from accelerate.utils import is_pynvml_available

        if not is_pynvml_available():
            raise ImportError(
                "To set CPU affinity on CUDA GPUs the `nvidia-ml-py` package must be available. (`pip install nvidia-ml-py`)"
            )
        import pynvml as nvml

        # The below code is based on https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow2/LanguageModeling/BERT/gpu_affinity.py
        nvml.nvmlInit()
        # number of 64-bit words needed to hold one bit per CPU
        num_elements = math.ceil(os.cpu_count() / 64)
        handle = nvml.nvmlDeviceGetHandleByIndex(local_process_index)
        affinity_string = ""
        for j in nvml.nvmlDeviceGetCpuAffinity(handle, num_elements):
            # assume nvml returns list of 64 bit ints
            # each word is prepended, so the first word ends up at the right-hand end of the bit string
            affinity_string = f"{j:064b}{affinity_string}"
        affinity_list = [int(x) for x in affinity_string]
        affinity_list.reverse()  # so core 0 is the 0th element
        # indices of the set bits are the CPU cores to pin to
        affinity_to_set = [i for i, e in enumerate(affinity_list) if e != 0]
        # pid 0 means "the calling process"
        os.sched_setaffinity(0, affinity_to_set)
        if verbose:
            cpu_cores = os.sched_getaffinity(0)
            logger.info(f"Assigning {len(cpu_cores)} cpu cores to process {local_process_index}: {cpu_cores}")


@lru_cache
def set_numa_affinity(local_process_index: int, verbose: Optional[bool] = None) -> None:
    """
    Assigns the current process to a specific NUMA node. Ideally most efficient when having at least 2 cpus per node.

    The call is cached per argument combination, so repeating it with the same arguments does nothing. If you want
    to force the affinity to be recomputed, please use `accelerate.utils.environment.override_numa_affinity`.

    Args:
        local_process_index (int):
            The index of the current process on the current server.
        verbose (bool, *optional*):
            Whether to print the new cpu cores assignment for each process. If `ACCELERATE_DEBUG_MODE` is enabled, will
            default to True.
    """
    override_numa_affinity(local_process_index, verbose=verbose)


@contextmanager
def clear_environment():
    """
    A context manager that empties `os.environ` for the duration of the block.

    On exit, whatever was set inside the block is discarded and the environment variables that existed before
    entering are put back.

    Example:

    ```python
    >>> import os
    >>> from accelerate.utils import clear_environment

    >>> os.environ["FOO"] = "bar"
    >>> with clear_environment():
    ...     print(os.environ)
    ...     os.environ["FOO"] = "new_bar"
    ...     print(os.environ["FOO"])
    {}
    new_bar

    >>> print(os.environ["FOO"])
    bar
    ```
    """
    saved_environ = dict(os.environ)
    os.environ.clear()

    try:
        yield
    finally:
        # wipe anything added inside the block before restoring the snapshot
        os.environ.clear()
        os.environ.update(saved_environ)


@contextmanager
def patch_environment(**kwargs):
    """
    A context manager that will add each keyword argument passed to `os.environ` and remove them when exiting.

    Will convert the values in `kwargs` to strings and upper-case all the keys. Variables that already existed are
    set back to their previous value on exit; variables that did not exist are removed. When several keyword
    arguments map to the same upper-cased key, the last one wins inside the block and the state from before the
    block is still what gets restored.

    Example:

    ```python
    >>> import os
    >>> from accelerate.utils import patch_environment

    >>> with patch_environment(FOO="bar"):
    ...     print(os.environ["FOO"])  # prints "bar"
    >>> print(os.environ["FOO"])  # raises KeyError
    ```
    """
    not_set = object()  # marks variables that were absent before patching
    previous = {}
    for key, value in kwargs.items():
        key = key.upper()
        # Only the first sighting of a key reflects the real pre-patch state; a later kwarg that upper-cases to
        # the same key must not overwrite it with the already-patched value.
        if key not in previous:
            previous[key] = os.environ.get(key, not_set)
        os.environ[key] = str(value)

    try:
        yield
    finally:
        for key, old_value in previous.items():
            if old_value is not_set:
                os.environ.pop(key, None)
            else:
                # restore previous value
                os.environ[key] = old_value


def purge_accelerate_environment(func_or_cls):
    """Decorator to clean up accelerate environment variables set by the decorated class or function.

    In some circumstances, calling certain classes or functions can result in accelerate env vars being set and not
    being cleaned up afterwards. As an example, when calling:

    TrainingArguments(fp16=True, ...)

    The following env var will be set:

    ACCELERATE_MIXED_PRECISION=fp16

    This can affect subsequent code, since the env var takes precedence over TrainingArguments(fp16=False). This is
    especially relevant for unit testing, where we want to avoid the individual tests to have side effects on one
    another. Decorate the unit test function or whole class with this decorator to ensure that after each test, the env
    vars are cleaned up. This works for both unittest.TestCase and normal classes (pytest); it also works when
    decorating the parent class.

    """
    prefix = "ACCELERATE_"

    @contextmanager
    def env_var_context():
        # Store existing accelerate env vars
        existing_vars = {k: v for k, v in os.environ.items() if k.startswith(prefix)}
        try:
            yield
        finally:
            # Restore original env vars or remove new ones
            for key in [k for k in os.environ if k.startswith(prefix)]:
                if key in existing_vars:
                    os.environ[key] = existing_vars[key]
                else:
                    os.environ.pop(key, None)

    def wrap_function(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            with env_var_context():
                return func(*args, **kwargs)

        wrapper._accelerate_is_purged_environment_wrapped = True
        return wrapper

    if not isinstance(func_or_cls, type):
        return wrap_function(func_or_cls)

    # Handle classes by wrapping test methods
    def wrap_test_methods(test_class_instance):
        for name in dir(test_class_instance):
            if name.startswith("test"):
                method = getattr(test_class_instance, name)
                if callable(method) and not hasattr(method, "_accelerate_is_purged_environment_wrapped"):
                    setattr(test_class_instance, name, wrap_function(method))
        return test_class_instance

    # Handle inheritance
    wrap_test_methods(func_or_cls)
    func_or_cls.__init_subclass__ = classmethod(lambda cls, **kw: wrap_test_methods(cls))
    return func_or_cls


================================================
FILE: src/accelerate/utils/fsdp_utils.py
================================================
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import functools
import os
import re
import shutil
import warnings
from collections import defaultdict
from collections.abc import Iterable
from contextlib import nullcontext
from pathlib import Path
from typing import Callable, Union

import torch

from ..logging import get_logger
from .constants import FSDP_MODEL_NAME, OPTIMIZER_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_NAME
from .dataclasses import get_module_class_from_name
from .modeling import get_non_persistent_buffers, is_peft_model
from .other import get_module_children_bottom_up, is_compiled_module, save
from .versions import is_torch_version


logger = get_logger(__name__)


def enable_fsdp_ram_efficient_loading():
    """
    Enables RAM efficient loading of Hugging Face models for FSDP in the environment.

    `ACCELERATE_USE_FSDP` is only set when it isn't defined yet, while `FSDP_CPU_RAM_EFFICIENT_LOADING` is always
    forced to `"True"`.
    """
    # Sets values for `transformers.modeling_utils.is_fsdp_enabled`
    os.environ.setdefault("ACCELERATE_USE_FSDP", "True")
    os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = "True"


def disable_fsdp_ram_efficient_loading():
    """
    Disables RAM efficient loading of Hugging Face models for FSDP by setting `FSDP_CPU_RAM_EFFICIENT_LOADING` to
    `"False"` in the environment.
    """
    os.environ.update({"FSDP_CPU_RAM_EFFICIENT_LOADING": "False"})


def _get_model_state_dict(model, adapter_only=False, sd_options=None):
    """
    Fetches the state dict of `model` through the API matching the setup.

    - `adapter_only` on a PEFT model: only the active adapter's weights, via `peft`.
    - `sd_options` given: `torch.distributed.checkpoint.state_dict.get_model_state_dict` with those options.
    - otherwise: plain `model.state_dict()`.
    """
    if adapter_only and is_peft_model(model):
        from peft import get_peft_model_state_dict

        return get_peft_model_state_dict(model, adapter_name=model.active_adapter)

    # Invariant: `sd_options` is not None only for FSDP2
    if sd_options is None:
        return model.state_dict()

    from torch.distributed.checkpoint.state_dict import get_model_state_dict

    return get_model_state_dict(model, options=sd_options)


def _set_model_state_dict(model, state_dict, adapter_only=False, sd_options=None):
    """
    Loads `state_dict` into `model` through the API matching the setup and returns that API's result.

    - `adapter_only` on a PEFT model: loaded into the active adapter, via `peft`.
    - `sd_options` given: `torch.distributed.checkpoint.state_dict.set_model_state_dict` with those options.
    - otherwise: plain `model.load_state_dict(state_dict)`.
    """
    if adapter_only and is_peft_model(model):
        from peft import set_peft_model_state_dict

        return set_peft_model_state_dict(model, state_dict, adapter_name=model.active_adapter)

    # Invariant: `sd_options` is not None only for FSDP2
    if sd_options is None:
        return model.load_state_dict(state_dict)

    from torch.distributed.checkpoint.state_dict import set_model_state_dict

    return set_model_state_dict(model, state_dict, options=sd_options)


def _prepare_sd_options(fsdp_plugin):
    """
    Builds the `StateDictOptions` matching `fsdp_plugin` when it targets FSDP2; returns `None` for any other FSDP
    version.
    """
    # we use this only for FSDP2, as it requires torch >= 2.6.0 and this api requires torch >= 2.2.0
    if fsdp_plugin.fsdp_version != 2:
        return None

    from torch.distributed.checkpoint.state_dict import StateDictOptions
    from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType

    wants_full_state_dict = fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT
    # the config object may lack these attributes, in which case both options are off
    offload_to_cpu = getattr(fsdp_plugin.state_dict_config, "offload_to_cpu", False)
    rank0_only = getattr(fsdp_plugin.state_dict_config, "rank0_only", False)
    return StateDictOptions(
        full_state_dict=wants_full_state_dict,
        cpu_offload=offload_to_cpu,
        broadcast_from_rank0=rank0_only,
    )


def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0, adapter_only=False):
    """
    Saves the weights of an FSDP `model` under `output_dir`, in the layout selected by
    `fsdp_plugin.state_dict_type`.

    - `FULL_STATE_DICT`: a single `{FSDP_MODEL_NAME}.bin` (or `{FSDP_MODEL_NAME}_{model_index}.bin` for a non-zero
      `model_index`), written with `torch.save` by process 0 only.
    - `LOCAL_STATE_DICT`: one `..._rank{process_index}.bin` file per process, written with `torch.save`.
    - `SHARDED_STATE_DICT`: a `{FSDP_MODEL_NAME}_{model_index}` directory written with `dist_cp.save`.

    Note that for `FULL_STATE_DICT` this mutates `fsdp_plugin.state_dict_config` (`offload_to_cpu` and
    `rank0_only`).

    Args:
        fsdp_plugin: The FSDP plugin holding the version, state dict type and state dict configs.
        accelerator: Used for `num_processes` and `process_index`.
        model: The model whose state dict is saved.
        output_dir: Target directory; created if missing.
        model_index: Index used to tell several saved models apart in the file/directory name.
        adapter_only: When `True` and `model` is a PEFT model, only the adapter weights are saved.
    """
    # Note: We import here to reduce import time from general modules, and isolate outside dependencies
    import torch.distributed.checkpoint as dist_cp
    from torch.distributed.checkpoint.default_planner import DefaultSavePlanner
    from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
    from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType

    os.makedirs(output_dir, exist_ok=True)
    if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
        # FSDP raises error when single GPU is used with `offload_to_cpu=True` for FULL_STATE_DICT
        # so, only enable it when num_processes>1
        is_multi_process = accelerator.num_processes > 1
        fsdp_plugin.state_dict_config.offload_to_cpu = is_multi_process
        fsdp_plugin.state_dict_config.rank0_only = is_multi_process

    # FSDP1 selects the state dict flavor through a context manager; other versions rely on `sd_options` instead
    ctx = (
        FSDP.state_dict_type(
            model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
        )
        if fsdp_plugin.fsdp_version == 1
        else nullcontext()
    )
    sd_options = _prepare_sd_options(fsdp_plugin)

    with ctx:
        # every process fetches the state dict, even when only process 0 writes it afterwards
        state_dict = _get_model_state_dict(model, adapter_only=adapter_only, sd_options=sd_options)
        if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
            weights_name = f"{FSDP_MODEL_NAME}.bin" if model_index == 0 else f"{FSDP_MODEL_NAME}_{model_index}.bin"
            output_model_file = os.path.join(output_dir, weights_name)
            if accelerator.process_index == 0:
                logger.info(f"Saving model to {output_model_file}")
                torch.save(state_dict, output_model_file)
                logger.info(f"Model saved to {output_model_file}")
        # Invariant: `LOCAL_STATE_DICT` is never possible with `FSDP2`
        elif fsdp_plugin.state_dict_type == StateDictType.LOCAL_STATE_DICT:
            weights_name = (
                f"{FSDP_MODEL_NAME}_rank{accelerator.process_index}.bin"
                if model_index == 0
                else f"{FSDP_MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin"
            )
            output_model_file = os.path.join(output_dir, weights_name)
            logger.info(f"Saving model to {output_model_file}")
            torch.save(state_dict, output_model_file)
            logger.info(f"Model saved to {output_model_file}")
        elif fsdp_plugin.state_dict_type == StateDictType.SHARDED_STATE_DICT:
            ckpt_dir = os.path.join(output_dir, f"{FSDP_MODEL_NAME}_{model_index}")
            os.makedirs(ckpt_dir, exist_ok=True)
            logger.info(f"Saving model to {ckpt_dir}")
            # stored under a "model" key; `load_fsdp_model` reads it back from the same key
            state_dict = {"model": state_dict}

            dist_cp.save(
                state_dict=state_dict,
                storage_writer=dist_cp.FileSystemWriter(ckpt_dir),
                planner=DefaultSavePlanner(),
            )
            logger.info(f"Model saved to {ckpt_dir}")


def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, adapter_only=False):
    """
    Loads weights saved by `save_fsdp_model` from `input_dir` into `model`, following the layout selected by
    `fsdp_plugin.state_dict_type`.

    Note that for `FULL_STATE_DICT` this mutates `fsdp_plugin.state_dict_config` (`offload_to_cpu` and
    `rank0_only`).

    Args:
        fsdp_plugin: The FSDP plugin holding the version, state dict type and state dict configs.
        accelerator: Used for synchronization and for `num_processes`, `process_index`, `is_fsdp2` and
            `is_main_process`.
        model: The model to load the weights into.
        input_dir: Directory holding the checkpoint. For `SHARDED_STATE_DICT` it is used as the checkpoint directory
            itself when its path already contains `FSDP_MODEL_NAME`.
        model_index: Index used to tell several saved models apart in the file/directory name.
        adapter_only: When `True` and `model` is a PEFT model, only the adapter weights are loaded.

    Returns:
        The result of `_set_model_state_dict`, or `None` when loading is skipped on a non-zero rank for a model
        that is not wrapped in FSDP yet (`FULL_STATE_DICT`, not FSDP2).

    Raises:
        ValueError: On that same skip path, if FSDP1 is used without `sync_module_states`.
    """
    # Note: We import here to reduce import time from general modules, and isolate outside dependencies
    import torch.distributed.checkpoint as dist_cp
    from torch.distributed.checkpoint.default_planner import DefaultLoadPlanner
    from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
    from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType

    accelerator.wait_for_everyone()
    if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
        # FSDP raises error when single GPU is used with `offload_to_cpu=True` for FULL_STATE_DICT
        # so, only enable it when num_processes>1
        is_multi_process = accelerator.num_processes > 1
        fsdp_plugin.state_dict_config.offload_to_cpu = is_multi_process
        fsdp_plugin.state_dict_config.rank0_only = is_multi_process

    # FSDP1 selects the state dict flavor through a context manager; other versions rely on `sd_options` instead
    ctx = (
        FSDP.state_dict_type(
            model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
        )
        if fsdp_plugin.fsdp_version == 1
        else nullcontext()
    )
    sd_options = _prepare_sd_options(fsdp_plugin)
    with ctx:
        if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
            # Not-yet-wrapped model on a non-zero rank (FSDP1 path): nothing is loaded here
            if type(model) is not FSDP and accelerator.process_index != 0 and not accelerator.is_fsdp2:
                if not fsdp_plugin.sync_module_states and fsdp_plugin.fsdp_version == 1:
                    raise ValueError(
                        "Set the `sync_module_states` flag to `True` so that model states are synced across processes when "
                        "initializing FSDP object"
                    )
                return
            weights_name = f"{FSDP_MODEL_NAME}.bin" if model_index == 0 else f"{FSDP_MODEL_NAME}_{model_index}.bin"
            input_model_file = os.path.join(input_dir, weights_name)
            logger.info(f"Loading model from {input_model_file}")
            # we want an empty state dict for FSDP2 as we use `broadcast_from_rank0`
            load_model = not accelerator.is_fsdp2 or accelerator.is_main_process
            if load_model:
                state_dict = torch.load(input_model_file, weights_only=True)
            else:
                state_dict = {}
            logger.info(f"Model loaded from {input_model_file}")
        elif fsdp_plugin.state_dict_type == StateDictType.LOCAL_STATE_DICT:
            # each process reads back its own per-rank file
            weights_name = (
                f"{FSDP_MODEL_NAME}_rank{accelerator.process_index}.bin"
                if model_index == 0
                else f"{FSDP_MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin"
            )
            input_model_file = os.path.join(input_dir, weights_name)
            logger.info(f"Loading model from {input_model_file}")
            state_dict = torch.load(input_model_file, weights_only=True)
            logger.info(f"Model loaded from {input_model_file}")
        elif fsdp_plugin.state_dict_type == StateDictType.SHARDED_STATE_DICT:
            ckpt_dir = (
                os.path.join(input_dir, f"{FSDP_MODEL_NAME}_{model_index}")
                if f"{FSDP_MODEL_NAME}" not in input_dir
                else input_dir
            )
            logger.info(f"Loading model from {ckpt_dir}")
            # the model's current state dict is passed to `dist_cp.load` under the "model" key and read back from it
            state_dict = {"model": _get_model_state_dict(model, adapter_only=adapter_only, sd_options=sd_options)}
            dist_cp.load(
                state_dict=state_dict,
                storage_reader=dist_cp.FileSystemReader(ckpt_dir),
                planner=DefaultLoadPlanner(),
            )
            state_dict = state_dict["model"]
            logger.info(f"Model loaded from {ckpt_dir}")

        load_result = _set_model_state_dict(model, state_dict, adapter_only=adapter_only, sd_options=sd_options)
    return load_result


def save_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, output_dir, optimizer_index=0):
    """
    Save the optimizer state of an FSDP-wrapped model under `output_dir`.

    With `FULL_STATE_DICT`, only the process with `process_index == 0` writes a single `.bin` file via `torch.save`.
    For every other state dict type, the state is written with `torch.distributed.checkpoint` into a
    `{OPTIMIZER_NAME}_{optimizer_index}` sub-directory.

    Args:
        fsdp_plugin: Plugin holding `fsdp_version`, `state_dict_type` and the state dict configs.
        accelerator: Accelerator instance; only `process_index` is read here.
        optimizer: The optimizer whose state is saved.
        model: The model the optimizer belongs to.
        output_dir: Directory to write into (created if missing).
        optimizer_index (`int`, defaults to `0`): Index used to name the output file / directory.
    """
    # Note: We import here to reduce import time from general modules, and isolate outside dependencies
    import torch.distributed.checkpoint as dist_cp
    from torch.distributed.checkpoint.default_planner import DefaultSavePlanner
    from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
    from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType

    os.makedirs(output_dir, exist_ok=True)

    # Only FSDP1 needs the `state_dict_type` context; FSDP2 runs under a no-op context.
    if fsdp_plugin.fsdp_version == 1:
        state_dict_ctx = FSDP.state_dict_type(
            model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
        )
    else:
        state_dict_ctx = nullcontext()

    sd_options = _prepare_sd_options(fsdp_plugin)

    with state_dict_ctx:
        # Gathering the optimizer state is done on every rank, before any rank-specific branching.
        if fsdp_plugin.fsdp_version == 2:
            from torch.distributed.checkpoint.state_dict import get_optimizer_state_dict

            optim_state = get_optimizer_state_dict(model, optimizer, options=sd_options)
        else:
            optim_state = FSDP.optim_state_dict(model, optimizer)

        if fsdp_plugin.state_dict_type != StateDictType.FULL_STATE_DICT:
            # Distributed checkpoint: one directory per optimizer
            ckpt_dir = os.path.join(output_dir, f"{OPTIMIZER_NAME}_{optimizer_index}")
            os.makedirs(ckpt_dir, exist_ok=True)
            logger.info(f"Saving Optimizer state to {ckpt_dir}")
            dist_cp.save(
                state_dict={"optimizer": optim_state},
                storage_writer=dist_cp.FileSystemWriter(ckpt_dir),
                planner=DefaultSavePlanner(),
            )
            logger.info(f"Optimizer state saved in {ckpt_dir}")
            return

        if accelerator.process_index != 0:
            # Full state dict: a single file written by process 0 only
            return

        if optimizer_index == 0:
            optim_state_name = f"{OPTIMIZER_NAME}.bin"
        else:
            optim_state_name = f"{OPTIMIZER_NAME}_{optimizer_index}.bin"
        output_optimizer_file = os.path.join(output_dir, optim_state_name)
        logger.info(f"Saving Optimizer state to {output_optimizer_file}")
        torch.save(optim_state, output_optimizer_file)
        logger.info(f"Optimizer state saved in {output_optimizer_file}")


def load_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, input_dir, optimizer_index=0, adapter_only=False):
    """
    Load an optimizer state previously written for an FSDP model and apply it to `optimizer`.

    Args:
        fsdp_plugin: Plugin holding `fsdp_version`, `state_dict_type` and the state dict configs.
        accelerator: Accelerator instance; used for the initial barrier and for `process_index`.
        optimizer: The optimizer to load the state into.
        model: The model the optimizer belongs to.
        input_dir: Directory the optimizer state is read from.
        optimizer_index (`int`, defaults to `0`): Index used to derive the file / directory name.
        adapter_only (`bool`, defaults to `False`): Accepted for signature compatibility; it is not read anywhere
            in this function body.
    """
    # Note: We import here to reduce import time from general modules, and isolate outside dependencies
    import torch.distributed.checkpoint as dist_cp
    from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
    from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType

    # Synchronize all processes before anything is read
    accelerator.wait_for_everyone()
    # Only FSDP1 needs the `state_dict_type` context; FSDP2 runs under a no-op context
    ctx = (
        FSDP.state_dict_type(
            model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
        )
        if fsdp_plugin.fsdp_version == 1
        else nullcontext()
    )
    sd_options = _prepare_sd_options(fsdp_plugin)
    with ctx:
        if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
            # Single-file checkpoint. `optim_state` stays `None` on processes that skip the read, i.e. every
            # process other than process 0 when `rank0_only` is set.
            optim_state = None
            if accelerator.process_index == 0 or not fsdp_plugin.optim_state_dict_config.rank0_only:
                optimizer_name = (
                    f"{OPTIMIZER_NAME}.bin" if optimizer_index == 0 else f"{OPTIMIZER_NAME}_{optimizer_index}.bin"
                )
                input_optimizer_file = os.path.join(input_dir, optimizer_name)
                logger.info(f"Loading Optimizer state from {input_optimizer_file}")
                optim_state = torch.load(input_optimizer_file, weights_only=True)
                logger.info(f"Optimizer state loaded from {input_optimizer_file}")
        else:
            # Distributed checkpoint. If `input_dir` already contains the optimizer name it is used as-is,
            # otherwise the `{OPTIMIZER_NAME}_{optimizer_index}` sub-directory is appended.
            ckpt_dir = (
                os.path.join(input_dir, f"{OPTIMIZER_NAME}_{optimizer_index}")
                if f"{OPTIMIZER_NAME}" not in input_dir
                else input_dir
            )
            logger.info(f"Loading Optimizer from {ckpt_dir}")
            # The current optimizer state is fetched first and handed to `dist_cp.load` as the target to fill
            if fsdp_plugin.fsdp_version == 2:
                from torch.distributed.checkpoint.state_dict import get_optimizer_state_dict

                optim_state = get_optimizer_state_dict(model, optimizer, options=sd_options)
            else:
                optim_state = FSDP.optim_state_dict(model, optimizer)
            optim_state = {"optimizer": optim_state}
            dist_cp.load(
                optim_state,
                checkpoint_id=ckpt_dir,
                storage_reader=dist_cp.FileSystemReader(ckpt_dir),
            )
            optim_state = optim_state["optimizer"]
            logger.info(f"Optimizer loaded from {ckpt_dir}")

        # Apply the loaded state with the API matching the FSDP version
        if fsdp_plugin.fsdp_version == 1:
            flattened_osd = FSDP.optim_state_dict_to_load(model=model, optim=optimizer, optim_state_dict=optim_state)
            optimizer.load_state_dict(flattened_osd)
        else:
            from torch.distributed.checkpoint.state_dict import set_optimizer_state_dict

            set_optimizer_state_dict(model, optimizer, optim_state, options=sd_options)


def _distributed_checkpoint_to_merged_weights(checkpoint_dir: str, save_path: str, safe_serialization: bool = True):
    """
    Load a `torch.distributed.checkpoint` directory in a single process and save it as one merged file, using the
    private loader helpers from `torch.distributed.checkpoint.format_utils`.

    Will save under `save_path` as either `model.safetensors` or `pytorch_model.bin`.

    Args:
        checkpoint_dir (`str`):
            Directory containing the distributed checkpoint.
        save_path (`str`):
            Output directory. It is created, including missing parent directories, if it does not exist.
        safe_serialization (`bool`, *optional*, defaults to `True`):
            Whether to save with safetensors instead of the pickle-based format.

    Returns:
        `pathlib.Path`: Path of the file that was written.
    """
    # Note: We import here to reduce import time from general modules, and isolate outside dependencies
    import torch.distributed.checkpoint as dist_cp
    import torch.distributed.checkpoint.format_utils as dist_cp_format_utils

    state_dict = {}
    save_path = Path(save_path)
    # `parents=True` so that a nested output path (e.g. `out/merged/model`) does not fail on a missing parent
    save_path.mkdir(parents=True, exist_ok=True)
    dist_cp_format_utils._load_state_dict(
        state_dict,
        storage_reader=dist_cp.FileSystemReader(checkpoint_dir),
        planner=dist_cp_format_utils._EmptyStateDictLoadPlanner(),
        no_dist=True,
    )
    save_path = save_path / (SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME)

    # To handle if state is a dict like {model: {...}}
    # Only unwrap when the single entry really is a nested dict; a checkpoint that holds exactly one tensor must
    # stay a `{name: tensor}` mapping, otherwise a bare tensor would be handed to `save`.
    if len(state_dict) == 1:
        (only_value,) = state_dict.values()
        if isinstance(only_value, dict):
            state_dict = only_value
    save(state_dict, save_path, safe_serialization=safe_serialization)
    return save_path


def merge_fsdp_weights(
    checkpoint_dir: str, output_path: str, safe_serialization: bool = True, remove_checkpoint_dir: bool = False
):
    """
    Merge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if
    `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors` if
    `safe_serialization` else `pytorch_model.bin`.

    Note: this is a CPU-bound process.

    Args:
        checkpoint_dir (`str`):
            The directory containing the FSDP checkpoints (can be either the model or optimizer).
        output_path (`str`):
            The path to save the merged checkpoint.
        safe_serialization (`bool`, *optional*, defaults to `True`):
            Whether to save the merged weights with safetensors (recommended).
        remove_checkpoint_dir (`bool`, *optional*, defaults to `False`):
            Whether to remove the checkpoint directory after merging.

    Raises:
        ValueError:
            If PyTorch is older than 2.3.0, or if `checkpoint_dir` does not contain a distributed checkpoint
            `.metadata` file (this includes the case where the directory does not exist at all).
    """
    checkpoint_dir = Path(checkpoint_dir)
    from accelerate.state import PartialState

    if not is_torch_version(">=", "2.3.0"):
        raise ValueError("`merge_fsdp_weights` requires PyTorch >= 2.3.0`")

    # Verify that the checkpoint directory holds a distributed checkpoint.
    # We look for the `.metadata` file rather than only for the directory: the hints below point at
    # sub-directories of `checkpoint_dir`, which can only exist when `checkpoint_dir` itself exists, so a plain
    # existence check on the directory would make them unreachable.
    if not (checkpoint_dir / ".metadata").exists():
        model_path_exists = (checkpoint_dir / "pytorch_model_fsdp_0").exists()
        optimizer_path_exists = (checkpoint_dir / "optimizer_0").exists()
        err = f"Tried to load from {checkpoint_dir} but couldn't find a valid metadata file."
        if model_path_exists and optimizer_path_exists:
            err += " However, potential model and optimizer checkpoint directories exist."
            err += f" Please pass in either {checkpoint_dir}/pytorch_model_fsdp_0 or {checkpoint_dir}/optimizer_0"
            err += " instead."
        elif model_path_exists:
            err += " However, a potential model checkpoint directory exists."
            err += f" Please try passing in {checkpoint_dir}/pytorch_model_fsdp_0 instead."
        elif optimizer_path_exists:
            err += " However, a potential optimizer checkpoint directory exists."
            err += f" Please try passing in {checkpoint_dir}/optimizer_0 instead."
        raise ValueError(err)

    # To setup `save` to work
    state = PartialState()
    # Only the main process merges; every process then meets at the barrier below
    if state.is_main_process:
        logger.info(f"Merging FSDP weights from {checkpoint_dir}")
        save_path = _distributed_checkpoint_to_merged_weights(checkpoint_dir, output_path, safe_serialization)
        logger.info(f"Successfully merged FSDP weights and saved to {save_path}")
        if remove_checkpoint_dir:
            logger.info(f"Removing old checkpoint directory {checkpoint_dir}")
            shutil.rmtree(checkpoint_dir)
    state.wait_for_everyone()


def ensure_weights_retied(param_init_fn, model: torch.nn.Module, device: torch.device):
    """
    Wrap `param_init_fn` so that parameters listed in `model._tied_weights_keys` are tied again after the init
    function has (potentially) re-allocated them.

    Args:
        param_init_fn: Callable taking a module and returning the (re-)initialized module.
        model (`torch.nn.Module`): Model whose `_tied_weights_keys` attribute names the tied parameters.
        device (`torch.device`): Not used by this function.

    Returns:
        `param_init_fn` unchanged when the model declares no tied weights, otherwise a wrapping callable with the
        same calling convention.
    """
    tied_fqns = getattr(model, "_tied_weights_keys", None)
    if not tied_fqns:
        # if no tied names just passthrough
        return param_init_fn

    # Maps `id(original tied parameter)` -> replacement parameter. `None` means the replacement has not been
    # observed yet.
    replacement_by_id = {}
    for fqn in tied_fqns:
        owner_path, _, leaf_name = fqn.rpartition(".")
        owner = model.get_submodule(owner_path)
        replacement_by_id[id(getattr(owner, leaf_name))] = None

    def param_init_fn_tied_param(module: torch.nn.Module):
        # Record, before initialization, which direct parameters of this module are tied ones
        # - usually only 1 name per parameter, but for completeness consider > 1
        names_by_id = defaultdict(list)
        for local_name, tensor in module.named_parameters(recurse=False):
            if id(tensor) in replacement_by_id:
                names_by_id[id(tensor)].append(local_name)

        # The init function may re-allocate the parameters
        module = param_init_fn(module)

        # Walk the recorded names and point them all at one shared parameter
        for original_id, local_names in names_by_id.items():
            for local_name in local_names:
                shared = replacement_by_id[original_id]
                if shared is not None:
                    setattr(module, local_name, shared)  # tie
                else:
                    # the first occurrence becomes the parameter everything else is tied to
                    replacement_by_id[original_id] = getattr(module, local_name)

        return module

    return param_init_fn_tied_param


def fsdp2_load_full_state_dict(accelerator, model: torch.nn.Module, full_sd: dict, cpu_offload: bool = False):
    """
    Loads the full state dict (could be only on rank 0) into the sharded model. This is done by broadcasting the
    parameters from rank 0 to all other ranks. This function modifies the model in-place.

    Args:
        accelerator (`Accelerator`): The accelerator instance
        model (`torch.nn.Module`):
            The model to load the state dict into, expected to be on meta device or a VRAM spike can occur
        full_sd (`dict`): The full state dict to load, can only be on rank 0
        cpu_offload (`bool`, defaults to `False`):
            If True, move sharded parameters to CPU after distribution. Required when FSDP CPU offloading is enabled.

    Returns:
        `torch.nn.Module`: The same `model` object, after `load_state_dict(..., assign=True)`.
    """
    import torch.distributed as dist
    from torch.distributed.tensor import DTensor, distribute_tensor

    # Model was previously copied to meta device
    meta_sharded_sd = model.state_dict()
    sharded_sd = {}

    # Helper: for one state dict entry, decide (a) whether the new tensor should be made contiguous and
    # (b) which dtype it should be cast to, based on the tensor currently registered under that name in `model`.
    def _infer_parameter_dtype(model, param_name, empty_param):
        try:
            old_param = model.get_parameter_or_buffer(param_name)
        except AttributeError:
            # Need this for LORA, as there some params are not *parameters* of sorts
            base_param_name, local_param_name = param_name.rsplit(".", 1)
            submodule = model.get_submodule(base_param_name)
            old_param = getattr(submodule, local_param_name)

        # `float8_e4m3fn` only exists in some torch builds, hence the `hasattr` probe
        is_torch_e4m3fn_available = hasattr(torch, "float8_e4m3fn")
        casting_dtype = None
        is_param_float8_e4m3fn = is_torch_e4m3fn_available and empty_param.dtype == torch.float8_e4m3fn

        # Floating point tensors (except float8_e4m3fn) are cast to the dtype of the tensor already in the model;
        # everything else keeps its dtype (`casting_dtype` stays `None`)
        if empty_param.dtype.is_floating_point and not is_param_float8_e4m3fn:
            casting_dtype = old_param.dtype

        return old_param is not None and old_param.is_contiguous(), casting_dtype

    # Helper: apply the optional dtype cast first, then the optional `.contiguous()`
    def _cast_and_contiguous(tensor, to_contiguous, dtype):
        if dtype is not None:
            tensor = tensor.to(dtype=dtype)
        if to_contiguous:
            tensor = tensor.contiguous()
        return tensor

    # Rank 0 distributes the full state dict to other ranks
    if accelerator.is_main_process:
        # NOTE: entries of `full_sd` and `meta_sharded_sd` are paired positionally by `zip`, so both are assumed to
        # list the same entries in the same order
        for (param_name, full_param), sharded_param in zip(full_sd.items(), meta_sharded_sd.values()):
            device_mesh = sharded_param.device_mesh
            full_param = full_param.detach().to(device_mesh.device_type)
            if isinstance(full_param, DTensor):
                # dist.broadcast() only supports torch.Tensor.
                # After prepare_tp(), model parameters may become DTensor.
                # To broadcast such a parameter, convert it to a local tensor first.
                full_param = full_param.to_local()
            dist.broadcast(full_param, src=0, group=dist.group.WORLD)
            sharded_tensor = distribute_tensor(full_param, device_mesh, sharded_param.placements)
            to_contiguous, casting_dtype = _infer_parameter_dtype(
                model,
                param_name,
                full_param,
            )
            sharded_tensor = _cast_and_contiguous(sharded_tensor, to_contiguous, casting_dtype)
            # When CPU offloading is enabled, FSDP2's lazy_init expects parameters on CPU
            if cpu_offload:
                sharded_tensor = sharded_tensor.to("cpu")
            sharded_sd[param_name] = sharded_tensor
    # We need this else to have a matching `broadcast` for all of the ranks, else we deadlock
    else:
        for param_name, sharded_param in meta_sharded_sd.items():
            device_mesh = sharded_param.device_mesh
            # Receive buffer for the broadcast, sized and typed after the entry in the meta state dict
            full_tensor = torch.empty(sharded_param.size(), device=device_mesh.device_type, dtype=sharded_param.dtype)
            dist.broadcast(full_tensor, src=0, group=dist.group.WORLD)
            sharded_tensor = distribute_tensor(full_tensor, device_mesh, sharded_param.placements)
            to_contiguous, casting_dtype = _infer_parameter_dtype(
                model,
                param_name,
                full_tensor,
            )
            sharded_tensor = _cast_and_contiguous(sharded_tensor, to_contiguous, casting_dtype)
            # When CPU offloading is enabled, FSDP2's lazy_init expects parameters on CPU
            if cpu_offload:
                sharded_tensor = sharded_tensor.to("cpu")
            sharded_sd[param_name] = sharded_tensor

    # we set `assign=True` because our params are on meta device
    model.load_state_dict(sharded_sd, assign=True)
    return model


def fsdp2_switch_optimizer_parameters(optimizer: torch.optim.Optimizer, mapping: dict):
    """
    Switches the parameters of the optimizer to new ones (sharded parameters in usual case). This function modifies the
    optimizer in-place.

    Args:
        optimizer (`torch.optim.Optimizer`): Optimizer instance which contains the original model parameters
        mapping (`dict`): Mapping from the original parameter (specified by `data_ptr`) to the sharded parameter

    Raises:
        KeyError:
            If a parameter in the optimizer couldn't be switched to its sharded version. This should never happen and
            indicates a bug. If we kept the original params instead of raising, the training wouldn't be numerically
            correct and weights wouldn't get updated.
    """
    try:
        # The lookup key is the `data_ptr` *attribute* of each parameter (it is read, not called).
        # All lookups are resolved before any param group is touched, so a missing entry leaves the optimizer
        # unchanged instead of half-switched.
        switched_params = [[mapping[p.data_ptr] for p in param_group["params"]] for param_group in optimizer.param_groups]
    except KeyError as err:
        # This shouldn't ever happen, but we want to fail here else training wouldn't be numerically correct
        # This basically means that we're missing a mapping from the original parameter to the sharded parameter
        raise KeyError(
            "A parameter in the optimizer couldn't be switched to its sharded version. This breaks the training. Please raise an issue on GitHub."
        ) from err

    for param_group, new_params in zip(optimizer.param_groups, switched_params):
        param_group["params"] = new_params


def fsdp2_apply_ac(accelerator, model: torch.nn.Module):
    """
    Applies the activation checkpointing to the model.

    A child module is wrapped with `checkpoint_wrapper` when the auto wrap policy built from the FSDP plugin accepts
    its *parent* module. If no policy can be built (the plugin's auto wrap policy is neither transformer- nor
    size-based), the model is returned unchanged.

    Args:
        accelerator (`Accelerator`): The accelerator instance
        model (`torch.nn.Module`): The model to apply the activation checkpointing to

    Returns:
        `torch.nn.Module`: The model with the activation checkpointing applied
    """

    from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
        checkpoint_wrapper,
    )

    auto_wrap_policy_func = fsdp2_prepare_auto_wrap_policy(accelerator.state.fsdp_plugin, model)
    if auto_wrap_policy_func is None:
        # `fsdp2_prepare_auto_wrap_policy` returns `None` for unsupported policies; calling it would raise a
        # `TypeError`, so there is simply nothing to wrap (same guard as in `fsdp2_prepare_model`).
        return model

    # The last entry is skipped, mirroring `fsdp2_prepare_model`, where it is the model itself
    for layer_name, layer in get_module_children_bottom_up(model, return_fqns=True)[:-1]:
        # Split the fully qualified name into parent path and attribute name; a top-level child has no parent path
        parent_name, _, child_name = layer_name.rpartition(".")

        parent_module = model.get_submodule(parent_name) if parent_name else model
        if auto_wrap_policy_func(parent_module):
            layer = checkpoint_wrapper(layer, preserve_rng_state=False)
            parent_module.register_module(child_name, layer)

    return model


def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
    """Prepares the model for FSDP2 in-place. Also returns the model to avoid misuse of the original model.

    Steps, in order: build the `fully_shard` kwargs from the FSDP plugin, optionally move the model to the meta
    device (`cpu_ram_efficient_loading`), shard the sub-modules selected by the auto wrap policy and then the root
    module, optionally redistribute the original weights and restore non-persistent buffers, and finally handle
    the fp32 upcast when mixed precision is enabled.

    Args:
        accelerator (`Accelerator`): The accelerator instance
        model (`torch.nn.Module`): The model to prepare

    Returns:
        `torch.nn.Module`: Prepared model
    """
    from torch.distributed.fsdp import FSDPModule, MixedPrecisionPolicy, fully_shard

    # Already sharded (directly, or underneath a compiled wrapper): nothing to do
    is_type_fsdp = isinstance(model, FSDPModule) or (
        is_compiled_module(model) and isinstance(model._orig_mod, FSDPModule)
    )
    if is_type_fsdp:
        return model

    fsdp2_plugin = accelerator.state.fsdp_plugin

    fsdp2_plugin.set_auto_wrap_policy(model)

    # Captured before any device move so that the real weights can be redistributed later
    original_sd = model.state_dict()
    mesh = getattr(accelerator, "torch_device_mesh", None)

    fsdp2_kwargs = {
        "reshard_after_forward": fsdp2_plugin.reshard_after_forward,
        "offload_policy": fsdp2_plugin.cpu_offload,
        # `fully_shard` does not accept `None` in case of `MixedPrecisionPolicy`
        "mp_policy": fsdp2_plugin.mixed_precision_policy or MixedPrecisionPolicy(),
        # When a device mesh is available, only the FSDP dimensions of it are passed on
        "mesh": mesh[tuple(accelerator.parallelism_config.fsdp_dim_names)] if mesh is not None else None,
    }

    # `ignored_params` is only supported in torch >= 2.7.0
    if is_torch_version(">=", "2.7.0") and fsdp2_plugin.ignored_modules is not None:
        fsdp2_kwargs["ignored_params"] = get_parameters_from_modules(
            fsdp2_plugin.ignored_modules, model, accelerator.device
        )

    model_has_params4bit = False
    for name, param in model.named_parameters():
        # this is a temporary fix whereby loading models with bnb params cannot be moved from
        # GPU to a meta device due with FSDP2 because torch operations don't return the original class type
        # bypassing the move to meta will still cause the VRAM spike, but at least it still will load
        if param.__class__.__name__ == "Params4bit":
            model_has_params4bit = True
            break

    if fsdp2_plugin.cpu_ram_efficient_loading and not model_has_params4bit:
        # Context: `fully_shard` moves the model to GPU if it was on CPU, however it can also be on `meta` and then it stays there even after `fully_shard`
        # For this reason, we need to move the model to `meta` device, as then sharding happens on `meta` device
        # If we kept the model on CPU (`cpu_ram_efficient_loading` has model be on CPU on all ranks, though non-main ranks only have `torch.empty`), `fully_shard` would move it to GPU
        # Afterwards, when we call `fsdp2_load_full_state_dict`, us creating the state_dict would result into briefly having two copies of model state_dict on the GPU -> VRAM spike

        # We need to keep the original non-persistent buffers, as those MAY not be in the state_dict, resulting in them staying on meta device
        # Also, these buffers aren't getting sharded by default
        # We get the FQNs of all non-persistent buffers, to re-register them after
        non_persistent_buffer_fqns = get_non_persistent_buffers(model, recurse=True, fqns=True)
        original_non_persistent_buffers = copy.deepcopy(
            {k: v for k, v in model.named_buffers() if k in non_persistent_buffer_fqns}
        )
        # We move the model to meta device, as then sharding happens on meta device
        model = model.to(torch.device("meta"))
        # We need to re-tie the weights, not exactly sure why, but if we don't do this, reference to `lm_head/embed_tokens` stay hanging -> more VRAM usage
        # We assume `transformers` models have a `tie_weights` method if they support it
        if hasattr(model, "tie_weights"):
            model.tie_weights()

    # The policy is `None` when the plugin's auto wrap policy is neither transformer- nor size-based; in that case
    # only the root module is sharded below
    auto_wrap_policy_func = fsdp2_prepare_auto_wrap_policy(fsdp2_plugin, model)
    if auto_wrap_policy_func is not None:
        # We skip the model itself, as that one is always wrapped
        for module in get_module_children_bottom_up(model)[:-1]:
            if auto_wrap_policy_func(module) and not isinstance(module, FSDPModule):
                fully_shard(module, **fsdp2_kwargs)

    if not isinstance(model, FSDPModule):
        fully_shard(model, **fsdp2_kwargs)

    if fsdp2_plugin.cpu_ram_efficient_loading and not model_has_params4bit:
        # If `cpu_ram_efficient_loading` is enabled, only rank 0 loads the weights
        # Other ranks have an empty model on `meta` device, so we need to distribute the weights properly
        # When CPU offloading is enabled, parameters need to stay on CPU after distribution
        from torch.distributed.fsdp import CPUOffloadPolicy

        fsdp2_load_full_state_dict(
            accelerator, model, original_sd, cpu_offload=isinstance(fsdp2_plugin.cpu_offload, CPUOffloadPolicy)
        )

    # Same condition as the meta-device branch above, which is where `original_non_persistent_buffers` is defined
    if fsdp2_plugin.cpu_ram_efficient_loading and not model_has_params4bit:
        # We re-register the buffers, as they may not be in the state_dict
        for fqn, buffer_tensor in original_non_persistent_buffers.items():
            buffer_tensor = buffer_tensor.to(accelerator.device)

            if "." in fqn:
                parent_fqn, local_buffer_name = fqn.rsplit(".", 1)
                parent_module = model.get_submodule(parent_fqn)
            else:
                local_buffer_name = fqn
                parent_module = model

            parent_module.register_buffer(local_buffer_name, buffer_tensor, persistent=False)

        # We need to tie the weights again, as call to `load_full_state_dict` breaks the tie
        # Needs to be called both here and above
        # removing this call makes the model have a slightly different loss
        # removing the call above leads to extra memory usage as explained in the comment above
        if hasattr(model, "tie_weights"):
            model.tie_weights()

    # There is no `dtype` attribution for nn.Module
    # Set it to None if it doesn't exist and do the upcast always
    model_dtype = getattr(model, "dtype", None)
    if accelerator.mixed_precision != "no" and (model_dtype is None or model_dtype != torch.float32):
        # We upcast the trainable parameters according to `deepspeed`'s implementation
        # More info about this can be found in `accelerator.py:prepare_model`s FSDP1 section
        upcasted_params = []
        for name, param in model.named_parameters():
            if param.requires_grad and param.dtype != torch.float32:
                upcasted_params.append(name)
                # NOTE(review): this only rebinds the local loop variable `param` to a new fp32 tensor; the
                # parameter registered on `model` is not replaced or modified by this statement. Confirm whether
                # the upcast announced by the warning below is actually expected to happen here.
                param = param.to(torch.float32)
        if accelerator.is_main_process and upcasted_params:
            # NOTE(review): the message ends with "..." although the full list of names is interpolated
            warnings.warn(
                "FSDP upcast of low precision parameters to fp32 (since mixed_precision != 'no') may affect the precision of model checkpoints. "
                f"This effects {len(upcasted_params)} parameters: {upcasted_params}..."
            )
    return model


def fsdp2_prepare_auto_wrap_policy(fsdp2_plugin, model: torch.nn.Module) -> Callable[[torch.nn.Module], bool]:
    """Builds a per-module predicate from the plugin's auto wrap policy, done to mimic the behaviour of the FSDP1
    auto wrap policy.

    Args:
        fsdp2_plugin (`FullyShardedDataParallelPlugin`):
            Instance of `FullyShardedDataParallelPlugin` containing the configuration options
        model (`torch.nn.Module`):
            The model to wrap

    Returns:
        `Callable[[torch.nn.Module], bool]`:
            The auto wrap policy function to be applied to the model, or `None` when the plugin's policy is neither
            `transformer_auto_wrap_policy` nor `size_based_auto_wrap_policy`.

    Raises:
        ValueError: If a transformer layer class name cannot be resolved to a class inside `model`.
    """
    from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy

    base_policy = fsdp2_plugin.auto_wrap_policy
    # A `functools.partial` is unwrapped so that the underlying policy function can be identified
    if isinstance(base_policy, functools.partial):
        base_policy = base_policy.func

    if base_policy is size_based_auto_wrap_policy:

        def policy(module: torch.nn.Module) -> bool:
            total_params = sum(p.numel() for p in module.parameters())
            return total_params > fsdp2_plugin.min_num_params

        return policy

    if base_policy is not transformer_auto_wrap_policy:
        return None

    # Class names: the model's `_no_split_modules` by default, overridden by the plugin setting when present
    default_names = getattr(model, "_no_split_modules", None)
    if default_names is None:
        default_names = []
    layer_class_names = list(default_names)
    if fsdp2_plugin.transformer_cls_names_to_wrap is not None:
        layer_class_names = fsdp2_plugin.transformer_cls_names_to_wrap

    resolved_classes = set()
    for class_name in layer_class_names:
        resolved = get_module_class_from_name(model, class_name)
        if resolved is None:
            raise ValueError(f"Could not find the transformer layer class {class_name} in the model.")
        resolved_classes.add(resolved)

    # `isinstance` needs a tuple; the set is complete at this point, so it is converted once
    classes_to_wrap = tuple(resolved_classes)

    def policy(module: torch.nn.Module) -> bool:
        # The plugin setting is read at call time; without explicit class names nothing is selected
        if fsdp2_plugin.transformer_cls_names_to_wrap is None:
            return False
        return isinstance(module, classes_to_wrap)

    return policy


def get_fsdp2_grad_scaler(**kwargs):
    """
    Returns a `GradScaler` for FSDP2, as the current implementation of `get_grad_scaler` doesn't accept other args. We
    need this as current `get_grad_scaler` accepts only `distributed_type` as arg, which doesn't differentiate between
    FSDP1 and FSDP2
    """
    from torch.amp.grad_scaler import GradScaler

    return GradScaler(**kwargs)


def fsdp2_canonicalize_names(named_params: dict) -> dict:
    """Removes parameter name modifiers in order to map them back to their original names.

    Three rewrites are applied to every key, one after the other: `._checkpoint_wrapped_module` is removed, keys
    starting with `_orig_mod.` have `_orig_mod.` removed, and finally `._orig_mod` is removed.

    See huggingface/accelerate#3554 for more context.

    Args:
        named_params (`dict`): The named parameters dictionary to canonicalize.

    Returns:
        `dict`: The canonicalized named parameters dictionary (a new dict, the input is left untouched)
    """

    def _drop_checkpoint_wrapper(key):
        return key.replace("._checkpoint_wrapped_module", "")

    def _drop_leading_orig_mod(key):
        if key.startswith("_orig_mod."):
            return key.replace("_orig_mod.", "")
        return key

    def _drop_inner_orig_mod(key):
        return key.replace("._orig_mod", "")

    # Each rewrite builds a fresh dict from the result of the previous one
    canonical = named_params
    for rewrite in (_drop_checkpoint_wrapper, _drop_leading_orig_mod, _drop_inner_orig_mod):
        canonical = {rewrite(key): value for key, value in canonical.items()}
    return canonical


def get_parameters_from_modules(
    modules: Union[Iterable[torch.nn.Module], str], model, device
) -> set[torch.nn.Parameter]:
    """Collects the parameters of the given modules, where modules can be a regex string or an iterable of
    torch.nn.Module

    Args:
        modules (`Union[Iterable[torch.nn.Module], str]`):
            Either the modules themselves, or a regular expression that is matched in full against the module names
            of `model`. `None` yields an empty set.
        model: Model whose `named_modules()` are searched when `modules` is a string.
        device: Device every regex-matched module is moved to; unused when `modules` is not a string.

    Returns:
        `set[torch.nn.Parameter]`: Set of parameters
    """
    if modules is None:
        return set()

    # code taken from accelerate while preparing kwargs for FSDP
    if isinstance(modules, str):
        pattern = re.compile(modules)
        selected = []
        for fqn, submodule in model.named_modules():
            if pattern.fullmatch(fqn) is None:
                continue
            submodule.to(device)
            selected.append(submodule)
        modules = selected

    return {param for module in modules for param in module.parameters()}


================================================
FILE: src/accelerate/utils/imports.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import importlib.metadata
import os
import sys
import warnings
from functools import lru_cache, wraps

import torch
from packaging import version
from packaging.version import parse

from .environment import parse_flag_from_env, patch_environment, str_to_bool
from .versions import compare_versions, is_torch_version


# Set the `USE_TORCH_XLA` environment flag to 0 to run a native Torch job in an environment that has TorchXLA
# installed.
USE_TORCH_XLA = parse_flag_from_env("USE_TORCH_XLA", default=True)

# Becomes True only when the flag is enabled AND both `torch_xla` imports below succeed.
_torch_xla_available = False
if USE_TORCH_XLA:
    try:
        import torch_xla.core.xla_model as xm  # noqa: F401
        import torch_xla.runtime

        _torch_xla_available = True
    except ImportError:
        # torch_xla is not installed: keep `_torch_xla_available` False
        pass

# Keep it for is_tpu_available. It will be removed along with is_tpu_available.
_tpu_available = _torch_xla_available

# Cache this result as it's a C FFI call which can be pretty time-consuming
_torch_distributed_available = torch.distributed.is_available()


def _is_package_available(pkg_name, metadata_name=None):
    """
    Check whether `pkg_name` is importable AND has installed distribution metadata.

    Args:
        pkg_name (`str`): Import name of the package.
        metadata_name (`str`, *optional*):
            Distribution name to look up in the metadata when it differs from the import name. Defaults to
            `pkg_name`.

    Returns:
        `bool`: `True` only if a module spec is found and the distribution metadata exists, `False` otherwise.
    """
    # `importlib.util` is imported explicitly instead of relying on another import having loaded it as a side
    # effect; `importlib.metadata` is re-imported because the line above binds `importlib` as a local name.
    import importlib.metadata
    import importlib.util

    # Check we're not importing a "pkg_name" directory somewhere but the actual library by trying to grab the version
    if importlib.util.find_spec(pkg_name) is None:
        # Explicit `False` (not an implicit `None`) so that callers always get a real boolean
        return False
    try:
        # Some libraries have different names in the metadata
        importlib.metadata.metadata(pkg_name if metadata_name is None else metadata_name)
    except importlib.metadata.PackageNotFoundError:
        return False
    return True


def is_torch_distributed_available() -> bool:
    """Return the module-level cached result of `torch.distributed.is_available()`."""
    cached_result = _torch_distributed_available
    return cached_result


def is_xccl_available():
    """Return whether the XCCL backend is available; always `False` for torch older than 2.7.0."""
    if not is_torch_version(">=", "2.7.0"):
        return False
    return torch.distributed.distributed_c10d.is_xccl_available()


def is_import_timer_available():
    """Return whether the `import_timer` package is installed."""
    package_found = _is_package_available("import_timer")
    return package_found


def is_pynvml_available():
    """Return whether `pynvml` is installed, under either the `pynvml` or the `nvidia-ml-py` distribution name."""
    found_as_pynvml = _is_package_available("pynvml")
    if found_as_pynvml:
        return found_as_pynvml
    # Same import name, but the distribution may be published as `nvidia-ml-py`
    return _is_package_available("pynvml", "nvidia-ml-py")


def is_pytest_available():
    """Return whether the `pytest` package is installed."""
    package_found = _is_package_available("pytest")
    return package_found


def is_msamp_available():
    """Return whether `msamp` is installed (looked up under the `ms-amp` distribution name)."""
    package_found = _is_package_available("msamp", "ms-amp")
    return package_found


def is_schedulefree_available():
    """Report whether the `schedulefree` package is installed."""
    installed = _is_package_available("schedulefree")
    return installed


def is_transformer_engine_available():
    """Report whether Transformer Engine is installed, looking for the Intel build when an HPU is available."""
    if is_hpu_available():
        module_name, dist_name = "intel_transformer_engine", "intel-transformer-engine"
    else:
        module_name, dist_name = "transformer_engine", "transformer-engine"
    return _is_package_available(module_name, dist_name)


def is_transformer_engine_mxfp8_available():
    """Return the first element of Transformer Engine's `check_mxfp8_support()`, or `False` when it is not installed."""
    if not _is_package_available("transformer_engine", "transformer-engine"):
        return False

    # Imported lazily so that the module is only touched once it is known to be installed.
    from transformer_engine.pytorch.fp8 import check_mxfp8_support

    support_info = check_mxfp8_support()
    return support_info[0]


def is_lomo_available():
    """Report whether the `lomo_optim` package is installed."""
    installed = _is_package_available("lomo_optim")
    return installed


def is_cuda_available():
    """
    Checks if `cuda` is available via an `nvml-based` check which won't trigger the drivers and leave cuda
    uninitialized.
    """
    # The environment override only needs to be active for the duration of the query.
    with patch_environment(PYTORCH_NVML_BASED_CUDA_CHECK="1"):
        return torch.cuda.is_available()


@lru_cache
def is_torch_xla_available(check_is_tpu=False, check_is_gpu=False):
    """
    Check if `torch_xla` is available. To train a native pytorch job in an environment with torch xla installed, set
    the USE_TORCH_XLA to false.

    With `check_is_gpu` the XLA device type must be "GPU" or "CUDA"; with `check_is_tpu` it must be "TPU". The two
    flags are mutually exclusive.
    """
    assert not (check_is_tpu and check_is_gpu), "The check_is_tpu and check_is_gpu cannot both be true."

    if not _torch_xla_available:
        return False
    if not (check_is_gpu or check_is_tpu):
        # No device restriction requested: a successful import is enough.
        return True

    device_type = torch_xla.runtime.device_type()
    if check_is_gpu:
        return device_type in ["GPU", "CUDA"]
    return device_type == "TPU"


def is_torchao_available():
    """Report whether `torchao` is installed at version 0.6.1 or newer."""
    if not _is_package_available("torchao"):
        return False
    installed_version = version.parse(importlib.metadata.version("torchao"))
    return compare_versions(installed_version, ">=", "0.6.1")


def is_deepspeed_available():
    """Report whether the `deepspeed` package is installed."""
    installed = _is_package_available("deepspeed")
    return installed


def is_pippy_available():
    """Report whether the installed torch version is at least 2.4.0."""
    torch_is_recent_enough = is_torch_version(">=", "2.4.0")
    return torch_is_recent_enough


def is_bf16_available(ignore_tpu=False):
    """
    Checks if bf16 is supported, optionally ignoring the TPU.

    Backends are probed in order (XLA TPU, CUDA, MLU, XPU, MPS) and the first one that is available decides the
    answer. When none of them is available, `True` is returned.
    """
    if is_torch_xla_available(check_is_tpu=True):
        return not ignore_tpu

    if is_cuda_available():
        supported = torch.cuda.is_bf16_supported()
    elif is_mlu_available():
        supported = torch.mlu.is_bf16_supported()
    elif is_xpu_available():
        supported = torch.xpu.is_bf16_supported()
    elif is_mps_available():
        supported = torch.backends.mps.is_macos_or_newer(14, 0)
    else:
        supported = True
    return supported


def is_fp16_available():
    """Checks if fp16 is supported: `False` when `is_habana_gaudi1()` is truthy, `True` otherwise."""
    return not is_habana_gaudi1()


def is_fp8_available():
    """Checks if fp8 is supported, i.e. whether MS-AMP, Transformer Engine or torchao is available."""
    outcome = is_msamp_available()
    if not outcome:
        outcome = is_transformer_engine_available()
    if not outcome:
        outcome = is_torchao_available()
    return outcome


def is_4bit_bnb_available():
    """Report whether `bitsandbytes` is installed at version 0.39.0 or newer."""
    if not _is_package_available("bitsandbytes"):
        return False
    installed_version = version.parse(importlib.metadata.version("bitsandbytes"))
    return compare_versions(installed_version, ">=", "0.39.0")


def is_8bit_bnb_available():
    """Report whether `bitsandbytes` is installed at version 0.37.2 or newer."""
    if not _is_package_available("bitsandbytes"):
        return False
    installed_version = version.parse(importlib.metadata.version("bitsandbytes"))
    return compare_versions(installed_version, ">=", "0.37.2")


def is_bnb_available(min_version=None):
    """
    Report whether `bitsandbytes` is installed.

    When `min_version` is given and the package is installed, the installed version must also be at least
    `min_version`.
    """
    installed = _is_package_available("bitsandbytes")
    if not installed or min_version is None:
        return installed
    installed_version = version.parse(importlib.metadata.version("bitsandbytes"))
    return compare_versions(installed_version, ">=", min_version)


def is_bitsandbytes_multi_backend_available():
    """Report whether the installed `bitsandbytes` lists "multi_backend" in its `features` attribute."""
    if is_bnb_available():
        import bitsandbytes as bnb

        # A module without a `features` attribute is treated as having no features.
        declared_features = getattr(bnb, "features", set())
        return "multi_backend" in declared_features
    return False


def is_torchvision_available():
    """Report whether the `torchvision` package is installed."""
    installed = _is_package_available("torchvision")
    return installed


def is_megatron_lm_available():
    """
    Check whether Megatron-LM has been requested and can be used.

    Returns:
        `True` only when the `ACCELERATE_USE_MEGATRON_LM` environment variable is truthy, the `megatron` package
        can be found, the installed `megatron-core` is at least 0.8.0 and `megatron.training` can be located.
        `False` in every other case (a warning is emitted when the version lookup or comparison fails).
    """
    if str_to_bool(os.environ.get("ACCELERATE_USE_MEGATRON_LM", "False")) != 1:
        return False
    if importlib.util.find_spec("megatron") is None:
        return False
    try:
        megatron_version = parse(importlib.metadata.version("megatron-core"))
        if compare_versions(megatron_version, ">=", "0.8.0"):
            # Convert the spec lookup into a real boolean instead of leaking a `ModuleSpec`/`None`.
            return importlib.util.find_spec(".training", "megatron") is not None
    except Exception as e:
        warnings.warn(f"Parse Megatron version failed. Exception:{e}")
    # Too old a version or a failed lookup: explicit `False` rather than an implicit `None`.
    return False


def is_transformers_available():
    """Report whether the `transformers` package is installed."""
    installed = _is_package_available("transformers")
    return installed


def is_datasets_available():
    """Report whether the `datasets` package is installed."""
    installed = _is_package_available("datasets")
    return installed


def is_peft_available():
    """Report whether the `peft` package is installed."""
    installed = _is_package_available("peft")
    return installed


def is_timm_available():
    """Report whether the `timm` package is installed."""
    installed = _is_package_available("timm")
    return installed


def is_triton_available():
    """Report whether `triton` is installed; when XPU is available the `pytorch-triton-xpu` metadata is checked."""
    # `None` makes the helper fall back to the import name for the metadata lookup.
    dist_name = "pytorch-triton-xpu" if is_xpu_available() else None
    return _is_package_available("triton", dist_name)


def is_aim_available():
    """Report whether `aim` is installed at a version strictly below 4.0.0."""
    if not _is_package_available("aim"):
        return False
    installed_version = version.parse(importlib.metadata.version("aim"))
    return compare_versions(installed_version, "<", "4.0.0")


def is_tensorboard_available():
    """Report whether either `tensorboard` or `tensorboardX` is installed."""
    has_tensorboard = _is_package_available("tensorboard")
    if has_tensorboard:
        return has_tensorboard
    return _is_package_available("tensorboardX")


def is_wandb_available():
    """Report whether the `wandb` package is installed."""
    installed = _is_package_available("wandb")
    return installed


def is_comet_ml_available():
    """Report whether the `comet_ml` package is installed."""
    installed = _is_package_available("comet_ml")
    return installed


def is_swanlab_available():
    """Report whether the `swanlab` package is installed."""
    installed = _is_package_available("swanlab")
    return installed


def is_trackio_available():
    """Report whether `trackio` is installed; always `False` on Python older than 3.10."""
    if sys.version_info < (3, 10):
        return False
    return _is_package_available("trackio")


def is_boto3_available():
    """Report whether the `boto3` package is installed."""
    installed = _is_package_available("boto3")
    return installed


def is_rich_available():
    """Report whether `rich` is installed and enabled through the `ACCELERATE_ENABLE_RICH` flag (default off)."""
    if not _is_package_available("rich"):
        return False
    return parse_flag_from_env("ACCELERATE_ENABLE_RICH", False)


def is_sagemaker_available():
    """Report whether the `sagemaker` package is installed."""
    installed = _is_package_available("sagemaker")
    return installed


def is_tqdm_available():
    """Report whether the `tqdm` package is installed."""
    installed = _is_package_available("tqdm")
    return installed


def is_clearml_available():
    """Report whether the `clearml` package is installed."""
    installed = _is_package_available("clearml")
    return installed


def is_pandas_available():
    """Report whether the `pandas` package is installed."""
    installed = _is_package_available("pandas")
    return installed


def is_matplotlib_available():
    """Report whether the `matplotlib` package is installed."""
    installed = _is_package_available("matplotlib")
    return installed


def is_mlflow_available():
    """Report whether `mlflow` is installed, either as the `mlflow` or the `mlflow-skinny` distribution."""
    if _is_package_available("mlflow"):
        return True

    # The `mlflow` module may also be provided by the `mlflow-skinny` distribution.
    if importlib.util.find_spec("mlflow") is None:
        return False
    try:
        importlib.metadata.metadata("mlflow-skinny")
    except importlib.metadata.PackageNotFoundError:
        return False
    return True


def is_mps_available(min_version="1.12"):
    """Checks if MPS device is available. The minimum version required is 1.12."""
    # With torch 1.12, you can use torch.backends.mps
    # With torch 2.0.0, you can use torch.mps
    version_ok = is_torch_version(">=", min_version)
    if not version_ok:
        return version_ok
    mps_backend = torch.backends.mps
    return mps_backend.is_available() and mps_backend.is_built()


@lru_cache
def is_mlu_available(check_device=False):
    """
    Checks if `mlu` is available via an `cndev-based` check which won't trigger the drivers and leave mlu
    uninitialized.

    `check_device` is accepted but not consulted by this check.
    """
    torch_mlu_spec = importlib.util.find_spec("torch_mlu")
    if torch_mlu_spec is None:
        return False

    import torch_mlu  # noqa: F401

    # The environment override only needs to be active for the duration of the query.
    with patch_environment(PYTORCH_CNDEV_BASED_MLU_CHECK="1"):
        return torch.mlu.is_available()


@lru_cache
def is_musa_available(check_device=False):
    """Checks if `torch_musa` is installed and potentially if a MUSA is in the environment"""
    if importlib.util.find_spec("torch_musa") is None:
        return False

    import torch_musa  # noqa: F401

    if not check_device:
        return hasattr(torch, "musa") and torch.musa.is_available()

    try:
        # Will raise a RuntimeError if no MUSA is found
        torch.musa.device_count()
        device_present = torch.musa.is_available()
    except RuntimeError:
        device_present = False
    return device_present


@lru_cache
def is_npu_available(check_device=False):
    """Checks if `torch_npu` is installed and potentially if a NPU is in the environment"""
    if importlib.util.find_spec("torch_npu") is None:
        return False

    # NOTE: importing torch_npu may raise error in some envs
    # e.g. inside cpu-only container with torch_npu installed
    try:
        import torch_npu  # noqa: F401
    except Exception:
        return False

    if not check_device:
        return hasattr(torch, "npu") and torch.npu.is_available()

    try:
        # Will raise a RuntimeError if no NPU is found
        torch.npu.device_count()
        device_present = torch.npu.is_available()
    except RuntimeError:
        device_present = False
    return device_present


@lru_cache
def is_sdaa_available(check_device=False):
    """Checks if `torch_sdaa` is installed and potentially if a SDAA is in the environment"""
    if importlib.util.find_spec("torch_sdaa") is None:
        return False

    import torch_sdaa  # noqa: F401

    if not check_device:
        return hasattr(torch, "sdaa") and torch.sdaa.is_available()

    try:
        # Will raise a RuntimeError if no SDAA is found
        torch.sdaa.device_count()
        device_present = torch.sdaa.is_available()
    except RuntimeError:
        device_present = False
    return device_present


@lru_cache
def is_hpu_available(init_hccl=False):
    """Checks if `torch.hpu` is installed and potentially if a HPU is in the environment"""
    # The parent package is probed first; the submodule is only looked up once the parent has been found.
    for module_name in ("habana_frameworks", "habana_frameworks.torch"):
        if importlib.util.find_spec(module_name) is None:
            return False

    import habana_frameworks.torch  # noqa: F401

    if init_hccl:
        import habana_frameworks.torch.distributed.hccl as hccl  # noqa: F401

    return hasattr(torch, "hpu") and torch.hpu.is_available()


def is_habana_gaudi1():
    """Report whether an HPU is available and its device type equals `synDeviceType.synDeviceGaudi`."""
    if not is_hpu_available():
        return False

    import habana_frameworks.torch.utils.experimental as htexp  # noqa: F401

    matches_gaudi1 = htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi
    return True if matches_gaudi1 else False


@lru_cache
def is_xpu_available(check_device=False):
    """
    Checks if XPU acceleration is available via stock PyTorch (>=2.7) and
    potentially if a XPU is in the environment
    """
    if is_torch_version("<=", "2.6"):
        return False

    if not check_device:
        return hasattr(torch, "xpu") and torch.xpu.is_available()

    try:
        # Will raise a RuntimeError if no XPU is found
        torch.xpu.device_count()
        device_present = torch.xpu.is_available()
    except RuntimeError:
        device_present = False
    return device_present


@lru_cache
def is_neuron_available(check_device=False):
    """Checks if `torch_neuronx` is installed and potentially if a Neuron device is in the environment"""
    if importlib.util.find_spec("torch_neuronx") is None:
        return False

    if not check_device:
        return hasattr(torch, "neuron") and torch.neuron.is_available()

    try:
        import torch_neuronx  # noqa: F401

        # Will raise a RuntimeError if no Neuron is found
        torch.neuron.device_count()
        device_present = torch.neuron.is_available()
    except RuntimeError:
        device_present = False
    return device_present


def is_dvclive_available():
    """Report whether the `dvclive` package is installed."""
    installed = _is_package_available("dvclive")
    return installed


def is_torchdata_available():
    """Report whether the `torchdata` package is installed."""
    installed = _is_package_available("torchdata")
    return installed


# TODO: Remove this function once stateful_dataloader is a stable feature in torchdata.
def is_torchdata_stateful_dataloader_available():
    """Report whether `torchdata` is installed at version 0.8.0 or newer."""
    if not _is_package_available("torchdata"):
        return False
    installed_version = version.parse(importlib.metadata.version("torchdata"))
    return compare_versions(installed_version, ">=", "0.8.0")


def torchao_required(func):
    """
    Decorator that lets `func` run only when torchao is available; otherwise calling it raises `ImportError`.
    """

    @wraps(func)
    def guarded(*args, **kwargs):
        # Availability is checked on every call, not once at decoration time.
        if is_torchao_available():
            return func(*args, **kwargs)
        raise ImportError(
            "`torchao` is not available, please install it before calling this function via `pip install torchao`."
        )

    return guarded


# TODO: Rework this into `utils.deepspeed` and migrate the "core" chunks into `accelerate.deepspeed`
def deepspeed_required(func):
    """
    Decorator that refuses to run `func` when an initialized `AcceleratorState` is not configured for DeepSpeed.
    """

    @wraps(func)
    def guarded(*args, **kwargs):
        from accelerate.state import AcceleratorState
        from accelerate.utils.dataclasses import DistributedType

        # Only inspect the distributed type once some shared state exists.
        state_is_populated = AcceleratorState._shared_state != {}
        if state_is_populated and AcceleratorState().distributed_type != DistributedType.DEEPSPEED:
            raise ValueError(
                "DeepSpeed is not enabled, please make sure that an `Accelerator` is configured for `deepspeed` "
                "before calling this function."
            )
        return func(*args, **kwargs)

    return guarded


def is_weights_only_available():
    """Report whether the installed torch version is at least 2.4.0."""
    # Weights only with allowlist was added in 2.4.0
    # ref: https://github.com/pytorch/pytorch/pull/124331
    torch_is_recent_enough = is_torch_version(">=", "2.4.0")
    return torch_is_recent_enough


def is_numpy_available(min_version="1.25.0"):
    """Report whether the installed `numpy` version is at least `min_version`."""
    installed_version = importlib.metadata.version("numpy")
    return compare_versions(parse(installed_version), ">=", min_version)


================================================
FILE: src/accelerate/utils/launch.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import subprocess
import sys
import warnings
from ast import literal_eval
from shutil import which
from typing import Any

import torch

from ..commands.config.config_args import SageMakerConfig
from ..utils import (
    DynamoBackend,
    PrecisionType,
    is_fp8_available,
    is_hpu_available,
    is_mlu_available,
    is_musa_available,
    is_neuron_available,
    is_npu_available,
    is_sdaa_available,
    is_torch_xla_available,
    is_xpu_available,
)
from ..utils.constants import DEEPSPEED_MULTINODE_LAUNCHERS
from ..utils.other import get_free_port, is_port_in_use, merge_dicts
from ..utils.versions import compare_versions
from . import parse_flag_from_env
from .dataclasses import DistributedType, SageMakerDistributedType


def _filter_args(args, parser, default_args=[]):
    """
    Filters out all `accelerate` specific args

    A namespace is built by parsing `default_args` with `parser`; every attribute of `args` that `parser` also
    knows is then copied onto it. `default_args` is only read, never mutated.
    """
    filtered, _unknown = parser.parse_known_args(default_args)
    recognised = vars(filtered)
    for name, value in vars(args).items():
        if name in recognised:
            setattr(filtered, name, value)
    return filtered


def _get_mpirun_args():
    """
    Determines the executable and argument names for mpirun, based on the type of install. The supported MPI programs
    are: OpenMPI, Intel MPI, or MVAPICH.

    Returns: Program name and arg names for hostfile, num processes, processes per node and process binding (the
    binding arg name is an empty string for non-Open MPI installs)
    """
    # Prefer `mpirun`, fall back to `mpiexec`
    launcher = next((name for name in ("mpirun", "mpiexec") if which(name)), None)
    if launcher is None:
        raise OSError("mpirun or mpiexec were not found. Ensure that Intel MPI, Open MPI, or MVAPICH are installed.")

    # The `--version` banner tells which MPI flavour is installed
    version_banner = subprocess.check_output([launcher, "--version"])
    if b"Open MPI" not in version_banner:
        # Intel MPI and MVAPICH both use the same arg names
        return launcher, "-f", "-n", "-ppn", ""
    return launcher, "--hostfile", "-n", "--npernode", "--bind-to"


def setup_fp8_env(args: argparse.Namespace, current_env: dict[str, str]):
    """
    Copy every non-`None` `fp8_*` attribute of `args` into `current_env` as an `ACCELERATE_FP8_*` string.

    `fp8_override_linear_precision` is expanded into three separate FPROP/DGRAD/WGRAD entries taken from its first
    three items. `current_env` is updated in place and returned.
    """
    prefix = "ACCELERATE_"
    override_suffixes = ("FP8_OVERRIDE_FPROP", "FP8_OVERRIDE_DGRAD", "FP8_OVERRIDE_WGRAD")
    for name in vars(args):
        if not name.startswith("fp8_"):
            continue
        setting = getattr(args, name)
        if setting is None:
            continue
        if name != "fp8_override_linear_precision":
            current_env[f"{prefix}{name.upper()}"] = str(setting)
            continue
        for position, suffix in enumerate(override_suffixes):
            current_env[prefix + suffix] = str(setting[position])
    return current_env


def prepare_simple_launcher_cmd_env(args: argparse.Namespace) -> tuple[list[str], dict[str, str]]:
    """
    Prepares and returns the command list and an environment with the correct simple launcher environment variables.

    Args:
        args: Parsed launch arguments. When `args.mpirun_hostfile` is set, the command is prefixed with the detected
            MPI launcher and its hostfile / process-count arguments.

    Returns:
        A `(cmd, current_env)` tuple: the command to execute and a copy of `os.environ` extended with the
        `ACCELERATE_*` and related variables.

    Raises:
        ValueError: If `--module` and `--no_python` are combined, or the mixed precision mode / dynamo backend is
            unknown.
        AssertionError: If several machines are requested without a main process IP or port.
        RuntimeError: If fp8 mixed precision is requested but no fp8 backend is available.
    """
    cmd = []
    if args.no_python and args.module:
        raise ValueError("--module and --no_python cannot be used together")

    num_processes = getattr(args, "num_processes", None)
    num_machines = args.num_machines
    if args.mpirun_hostfile is not None:
        mpi_app_name, hostfile_arg, num_proc_arg, proc_per_node_arg, bind_to_arg = _get_mpirun_args()
        bind_to = getattr(args, "bind-to", "socket")
        nproc_per_node = str(num_processes // num_machines) if num_processes and num_machines else "1"
        cmd += [
            mpi_app_name,
            hostfile_arg,
            args.mpirun_hostfile,
            proc_per_node_arg,
            nproc_per_node,
        ]
        if num_processes:
            cmd += [num_proc_arg, str(num_processes)]
        # Only some MPI flavours expose a binding argument (empty string otherwise).
        if bind_to_arg:
            cmd += [bind_to_arg, bind_to]
    if not args.no_python:
        cmd.append(sys.executable)
        if args.module:
            cmd.append("-m")
    cmd.append(args.training_script)
    cmd.extend(args.training_script_args)

    current_env = os.environ.copy()
    use_cpu = args.cpu or args.use_cpu
    current_env["ACCELERATE_USE_CPU"] = str(use_cpu)
    if args.debug:
        current_env["ACCELERATE_DEBUG_MODE"] = "true"
    if args.gpu_ids != "all" and args.gpu_ids is not None:
        # Each accelerator family restricts visible devices through its own environment variable.
        if is_xpu_available():
            current_env["ZE_AFFINITY_MASK"] = args.gpu_ids
        elif is_mlu_available():
            current_env["MLU_VISIBLE_DEVICES"] = args.gpu_ids
        elif is_sdaa_available():
            current_env["SDAA_VISIBLE_DEVICES"] = args.gpu_ids
        elif is_musa_available():
            current_env["MUSA_VISIBLE_DEVICES"] = args.gpu_ids
        elif is_npu_available():
            current_env["ASCEND_RT_VISIBLE_DEVICES"] = args.gpu_ids
        elif is_hpu_available():
            current_env["HABANA_VISIBLE_MODULES"] = args.gpu_ids
        elif is_neuron_available():
            current_env["NEURON_RT_VISIBLE_CORES"] = args.gpu_ids
        else:
            current_env["CUDA_VISIBLE_DEVICES"] = args.gpu_ids
    if num_machines > 1:
        assert args.main_process_ip is not None, (
            "When using multiple machines, you need to specify the main process IP."
        )
        assert args.main_process_port is not None, (
            "When using multiple machines, you need to specify the main process port."
        )

    if (num_processes is not None and num_processes > 1) or num_machines > 1:
        current_env["MASTER_ADDR"] = args.main_process_ip if args.main_process_ip is not None else "127.0.0.1"
        current_env["MASTER_PORT"] = str(args.main_process_port) if args.main_process_port is not None else "29500"
    # Test the flag itself: passing its *value* to `parse_flag_from_env` would look up an environment variable
    # literally named "True"/"False", so the CPU tuning below would never be applied.
    if use_cpu:
        current_env["KMP_AFFINITY"] = "granularity=fine,compact,1,0"
        current_env["KMP_BLOCKTIME"] = str(1)

    try:
        mixed_precision = PrecisionType(args.mixed_precision.lower())
    except ValueError:
        raise ValueError(
            f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
        )

    current_env["ACCELERATE_MIXED_PRECISION"] = str(mixed_precision)
    if args.mixed_precision.lower() == "fp8":
        if not is_fp8_available():
            raise RuntimeError(
                "FP8 is not available on this machine. Please ensure that either Transformer Engine, MSAMP or torchao is installed."
            )
        current_env = setup_fp8_env(args, current_env)

    try:
        dynamo_backend = DynamoBackend(args.dynamo_backend.upper())
    except ValueError:
        raise ValueError(
            f"Unknown dynamo backend: {args.dynamo_backend.upper()}. Choose between {DynamoBackend.list()}."
        )
    current_env["ACCELERATE_DYNAMO_BACKEND"] = dynamo_backend.value
    current_env["ACCELERATE_DYNAMO_MODE"] = args.dynamo_mode
    current_env["ACCELERATE_DYNAMO_USE_FULLGRAPH"] = str(args.dynamo_use_fullgraph)
    current_env["ACCELERATE_DYNAMO_USE_DYNAMIC"] = str(args.dynamo_use_dynamic)
    current_env["ACCELERATE_DYNAMO_USE_REGIONAL_COMPILATION"] = str(args.dynamo_use_regional_compilation)

    current_env["OMP_NUM_THREADS"] = str(args.num_cpu_threads_per_process)
    if args.enable_cpu_affinity:
        current_env["ACCELERATE_CPU_AFFINITY"] = "1"
    return cmd, current_env


def prepare_multi_gpu_env(args: argparse.Namespace) -> dict[str, str]:
    """
    Prepares and returns an environment with the correct multi-GPU environment variables.

    Besides building the environment, this function mutates `args` in place: it resolves `main_process_port` and
    sets launcher attributes such as `nproc_per_node`, `nnodes`, `node_rank`, `master_addr`, `master_port`,
    `rdzv_endpoint` and `standalone` depending on the requested topology.

    Args:
        args: Parsed launch arguments.

    Returns:
        A copy of `os.environ` extended with the `ACCELERATE_*`, `FSDP_*`, `MEGATRON_LM_*` and related variables.

    Raises:
        ConnectionError: If the main process port is already in use when launching on several machines.
        ValueError: If `--module` and `--no_python` are combined, the mixed precision mode or dynamo backend is
            unknown, or `fsdp_cpu_ram_efficient_loading` is set without `fsdp_sync_module_states`.
        RuntimeError: If fp8 mixed precision is requested but no fp8 backend is available.
    """
    # get free port and update configurations
    if args.main_process_port == 0:
        args.main_process_port = get_free_port()

    elif args.main_process_port is None:
        args.main_process_port = 29500

    num_processes = args.num_processes
    num_machines = args.num_machines
    main_process_ip = args.main_process_ip
    main_process_port = args.main_process_port
    if num_machines > 1:
        # Multi-node: processes are split evenly (integer division) across the machines.
        args.nproc_per_node = str(num_processes // num_machines)
        args.nnodes = str(num_machines)
        args.node_rank = int(args.machine_rank)
        if getattr(args, "same_network", False):
            args.master_addr = str(main_process_ip)
            args.master_port = str(main_process_port)
        else:
            args.rdzv_endpoint = f"{main_process_ip}:{main_process_port}"
    else:
        args.nproc_per_node = str(num_processes)
        if main_process_port is not None:
            args.master_port = str(main_process_port)

    # only need to check port availability in main process, in case we have to start multiple launchers on the same machine
    # for some reasons like splitting log files.
    need_port_check = num_machines <= 1 or int(args.machine_rank) == 0
    if need_port_check and is_port_in_use(main_process_port):
        if num_machines <= 1:
            # Single machine: warn and switch to standalone mode instead of failing.
            args.standalone = True
            warnings.warn(
                f"Port `{main_process_port}` is already in use. "
                "Accelerate will attempt to launch in a standalone-like mode by finding an open port automatically for this session. "
                "If this current attempt fails, or for more control in future runs, please specify a different port "
                "(e.g., `--main_process_port <your_chosen_port>`) or use `--main_process_port 0` for automatic selection "
                "in your launch command or Accelerate config file."
            )
        else:
            raise ConnectionError(
                f"Tried to launch distributed communication on port `{main_process_port}`, but another process is utilizing it. "
                "Please specify a different port (such as using the `--main_process_port` flag or specifying a different `main_process_port` in your config file)"
                " and rerun your script. To automatically use the next open port (on a single node), you can set this to `0`."
            )

    if args.module and args.no_python:
        raise ValueError("--module and --no_python cannot be used together")
    elif args.module:
        # Normalize a truthy value to the literal `True`.
        args.module = True
    elif args.no_python:
        # Normalize a truthy value to the literal `True`.
        args.no_python = True

    current_env = os.environ.copy()
    if args.debug:
        current_env["ACCELERATE_DEBUG_MODE"] = "true"
    gpu_ids = getattr(args, "gpu_ids", "all")
    if gpu_ids != "all" and args.gpu_ids is not None:
        # Each accelerator family restricts visible devices through its own environment variable.
        if is_xpu_available():
            current_env["ZE_AFFINITY_MASK"] = gpu_ids
        elif is_mlu_available():
            current_env["MLU_VISIBLE_DEVICES"] = gpu_ids
        elif is_sdaa_available():
            current_env["SDAA_VISIBLE_DEVICES"] = gpu_ids
        elif is_musa_available():
            current_env["MUSA_VISIBLE_DEVICES"] = gpu_ids
        elif is_npu_available():
            current_env["ASCEND_RT_VISIBLE_DEVICES"] = gpu_ids
        elif is_hpu_available():
            current_env["HABANA_VISIBLE_MODULES"] = gpu_ids
        elif is_neuron_available():
            current_env["NEURON_RT_VISIBLE_CORES"] = gpu_ids
        else:
            current_env["CUDA_VISIBLE_DEVICES"] = gpu_ids
    mixed_precision = args.mixed_precision.lower()
    try:
        mixed_precision = PrecisionType(mixed_precision)
    except ValueError:
        raise ValueError(f"Unknown mixed_precision mode: {mixed_precision}. Choose between {PrecisionType.list()}.")

    current_env["ACCELERATE_MIXED_PRECISION"] = str(mixed_precision)
    if args.mixed_precision.lower() == "fp8":
        if not is_fp8_available():
            raise RuntimeError(
                "FP8 is not available on this machine. Please ensure that either Transformer Engine, MSAMP or torchao is installed."
            )
        current_env = setup_fp8_env(args, current_env)

    try:
        dynamo_backend = DynamoBackend(args.dynamo_backend.upper())
    except ValueError:
        raise ValueError(
            f"Unknown dynamo backend: {args.dynamo_backend.upper()}. Choose between {DynamoBackend.list()}."
        )
    current_env["ACCELERATE_DYNAMO_BACKEND"] = dynamo_backend.value
    current_env["ACCELERATE_DYNAMO_MODE"] = args.dynamo_mode
    current_env["ACCELERATE_DYNAMO_USE_FULLGRAPH"] = str(args.dynamo_use_fullgraph)
    current_env["ACCELERATE_DYNAMO_USE_DYNAMIC"] = str(args.dynamo_use_dynamic)
    current_env["ACCELERATE_DYNAMO_USE_REGIONAL_COMPILATION"] = str(args.dynamo_use_regional_compilation)

    if args.use_fsdp:
        current_env["ACCELERATE_USE_FSDP"] = "true"
        if args.fsdp_cpu_ram_efficient_loading and not args.fsdp_sync_module_states:
            raise ValueError("When using `--fsdp_cpu_ram_efficient_loading` set `--fsdp_sync_module_states` to `True`")

        # Falls back to version "1" when `args` carries no `fsdp_version` attribute.
        current_env["FSDP_VERSION"] = str(args.fsdp_version) if hasattr(args, "fsdp_version") else "1"

        # For backwards compatibility, we support this in launched scripts,
        # however, we do not ask users for this in `accelerate config` CLI
        current_env["FSDP_SHARDING_STRATEGY"] = str(args.fsdp_sharding_strategy)

        current_env["FSDP_RESHARD_AFTER_FORWARD"] = str(args.fsdp_reshard_after_forward).lower()
        current_env["FSDP_OFFLOAD_PARAMS"] = str(args.fsdp_offload_params).lower()
        current_env["FSDP_MIN_NUM_PARAMS"] = str(args.fsdp_min_num_params)
        # Optional settings are only exported when they were actually provided.
        if args.fsdp_auto_wrap_policy is not None:
            current_env["FSDP_AUTO_WRAP_POLICY"] = str(args.fsdp_auto_wrap_policy)
        if args.fsdp_transformer_layer_cls_to_wrap is not None:
            current_env["FSDP_TRANSFORMER_CLS_TO_WRAP"] = str(args.fsdp_transformer_layer_cls_to_wrap)
        if args.fsdp_backward_prefetch is not None:
            current_env["FSDP_BACKWARD_PREFETCH"] = str(args.fsdp_backward_prefetch)
        if args.fsdp_state_dict_type is not None:
            current_env["FSDP_STATE_DICT_TYPE"] = str(args.fsdp_state_dict_type)
        current_env["FSDP_FORWARD_PREFETCH"] = str(args.fsdp_forward_prefetch).lower()
        current_env["FSDP_USE_ORIG_PARAMS"] = str(args.fsdp_use_orig_params).lower()
        current_env["FSDP_CPU_RAM_EFFICIENT_LOADING"] = str(args.fsdp_cpu_ram_efficient_loading).lower()
        current_env["FSDP_SYNC_MODULE_STATES"] = str(args.fsdp_sync_module_states).lower()
        current_env["FSDP_ACTIVATION_CHECKPOINTING"] = str(args.fsdp_activation_checkpointing).lower()
        if getattr(args, "fsdp_ignored_modules", None) is not None:
            current_env["FSDP_IGNORED_MODULES"] = str(args.fsdp_ignored_modules)

    if args.use_megatron_lm:
        # Megatron-LM settings are exported as `MEGATRON_LM_*`; optional ones only when not `None`.
        prefix = "MEGATRON_LM_"
        current_env["ACCELERATE_USE_MEGATRON_LM"] = "true"
        current_env[prefix + "TP_DEGREE"] = str(args.megatron_lm_tp_degree)
        current_env[prefix + "USE_CUSTOM_FSDP"] = str(args.megatron_lm_use_custom_fsdp)
        if args.megatron_lm_no_load_optim is not None:
            current_env[prefix + "NO_LOAD_OPTIM"] = str(args.megatron_lm_no_load_optim)
        if args.megatron_lm_eod_mask_loss is not None:
            current_env[prefix + "EOD_MASK_LOSS"] = str(args.megatron_lm_eod_mask_loss)
        if args.megatron_lm_no_save_optim is not None:
            current_env[prefix + "NO_SAVE_OPTIM"] = str(args.megatron_lm_no_save_optim)
        if args.megatron_lm_optimizer_cpu_offload is not None:
            current_env[prefix + "OPTIMIZER_CPU_OFFLOAD"] = str(args.megatron_lm_optimizer_cpu_offload)
        if args.megatron_lm_use_precision_aware_optimizer is not None:
            current_env[prefix + "USE_PRECISION_AWARE_OPTIMIZER"] = str(args.megatron_lm_use_precision_aware_optimizer)
        if args.megatron_lm_overlap_cpu_optimizer_d2h_h2d is not None:
            current_env[prefix + "OVERLAP_CPU_OPTIMIZER_D2H_H2D"] = str(args.megatron_lm_overlap_cpu_optimizer_d2h_h2d)
        if args.megatron_lm_decoder_last_pipeline_num_layers is not None:
            current_env[prefix + "DECODER_LAST_PIPELINE_NUM_LAYERS"] = str(
                args.megatron_lm_decoder_last_pipeline_num_layers
            )
        current_env[prefix + "PP_DEGREE"] = str(args.megatron_lm_pp_degree)
        current_env[prefix + "GRADIENT_CLIPPING"] = str(args.megatron_lm_gradient_clipping)
        if args.megatron_lm_num_micro_batches is not None:
            current_env[prefix + "NUM_MICRO_BATCHES"] = str(args.megatron_lm_num_micro_batches)
        if args.megatron_lm_sequence_parallelism is not None:
            current_env[prefix + "SEQUENCE_PARALLELISM"] = str(args.megatron_lm_sequence_parallelism)
        if args.megatron_lm_recompute_activations is not None:
            current_env[prefix + "RECOMPUTE_ACTIVATIONS"] = str(args.megatron_lm_recompute_activations)
        if args.megatron_lm_use_distributed_optimizer is not None:
            current_env[prefix + "USE_DISTRIBUTED_OPTIMIZER"] = str(args.megatron_lm_use_distributed_optimizer)
        if args.megatron_lm_recompute_granularity is not None:
            current_env[prefix + "RECOMPUTE_GRANULARITY"] = str(args.megatron_lm_recompute_granularity)
        if args.megatron_lm_recompute_method is not None:
            current_env[prefix + "RECOMPUTE_METHOD"] = str(args.megatron_lm_recompute_method)
        if args.megatron_lm_recompute_num_layers is not None:
            current_env[prefix + "RECOMPUTE_NUM_LAYERS"] = str(args.megatron_lm_recompute_num_layers)
        if args.megatron_lm_attention_backend is not None:
            current_env[prefix + "ATTENTION_BACKEND"] = str(args.megatron_lm_attention_backend)
        if args.megatron_lm_expert_model_parallel_size is not None:
            current_env[prefix + "EXPERT_MODEL_PARALLEL_SIZE"] = str(args.megatron_lm_expert_model_parallel_size)
        if args.megatron_lm_context_parallel_size is not None:
            current_env[prefix + "CONTEXT_PARALLEL_SIZE"] = str(args.megatron_lm_context_parallel_size)
        if args.megatron_lm_attention_dropout is not None:
            current_env[prefix + "ATTENTION_DROPOUT"] = str(args.megatron_lm_attention_dropout)
        if args.megatron_lm_hidden_dropout is not None:
            current_env[prefix + "HIDDEN_DROPOUT"] = str(args.megatron_lm_hidden_dropout)
        if args.megatron_lm_attention_softmax_in_fp32 is not None:
            current_env[prefix + "ATTENTION_SOFTMAX_IN_FP32"] = str(args.megatron_lm_attention_softmax_in_fp32)
        if args.megatron_lm_expert_tensor_parallel_size is not None:
            current_env[prefix + "EXPERT_TENSOR_PARALLEL_SIZE"] = str(args.megatron_lm_expert_tensor_parallel_size)
        if args.megatron_lm_calculate_per_token_loss is not None:
            current_env[prefix + "CALCULATE_PER_TOKEN_LOSS"] = str(args.megatron_lm_calculate_per_token_loss)
        if args.megatron_lm_use_rotary_position_embeddings is not None:
            current_env[prefix + "USE_ROTARY_POSITION_EMBEDDINGS"] = str(
                args.megatron_lm_use_rotary_position_embeddings
            )

    current_env["OMP_NUM_THREADS"] = str(args.num_cpu_threads_per_process)
    if args.enable_cpu_affinity:
        current_env["ACCELERATE_CPU_AFFINITY"] = "1"

    if args.use_parallelism_config:
        current_env = prepare_extend_env_parallelism_config(args, current_env)

    return current_env


def prepare_extend_env_parallelism_config(
    args: argparse.Namespace, current_env: dict[str, str]
) -> dict[str, str]:
    """
    Extends `current_env` with the `PARALLELISM_CONFIG_*` environment variables taken from the launch arguments.

    The data-parallel (replicate/shard), tensor-parallel, context-parallel and sequence-parallel sizes and the CP/SP
    backends are always exported. The CP communication strategy is only exported when the CP size is greater than 1,
    and the SP sequence-length / attention settings only when the SP size is greater than 1.

    Args:
        args (`argparse.Namespace`):
            Parsed launch arguments carrying the `parallelism_config_*` attributes.
        current_env (`dict[str, str]`):
            Environment mapping to extend. It is modified in place.

    Returns:
        `dict[str, str]`: The same `current_env` object, with the parallelism variables added.
    """

    prefix = "PARALLELISM_CONFIG_"

    current_env["ACCELERATE_USE_PARALLELISM_CONFIG"] = "true"
    current_env[prefix + "DP_REPLICATE_SIZE"] = str(args.parallelism_config_dp_replicate_size)
    current_env[prefix + "DP_SHARD_SIZE"] = str(args.parallelism_config_dp_shard_size)
    current_env[prefix + "TP_SIZE"] = str(args.parallelism_config_tp_size)
    current_env[prefix + "CP_SIZE"] = str(args.parallelism_config_cp_size)
    current_env[prefix + "CP_BACKEND"] = str(args.parallelism_config_cp_backend)
    current_env[prefix + "SP_SIZE"] = str(args.parallelism_config_sp_size)
    current_env[prefix + "SP_BACKEND"] = str(args.parallelism_config_sp_backend)
    # Strategy-specific settings are only meaningful when the matching parallelism is actually enabled.
    if args.parallelism_config_cp_size > 1:
        current_env[prefix + "CP_COMM_STRATEGY"] = str(args.parallelism_config_cp_comm_strategy)
    if args.parallelism_config_sp_size > 1:
        current_env[prefix + "SP_SEQ_LENGTH"] = str(args.parallelism_config_sp_seq_length)
        current_env[prefix + "SP_SEQ_LENGTH_IS_VARIABLE"] = str(args.parallelism_config_sp_seq_length_is_variable)
        current_env[prefix + "SP_ATTN_IMPLEMENTATION"] = str(args.parallelism_config_sp_attn_implementation)

    return current_env


def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> tuple[list[str], dict[str, str]]:
    """
    Prepares and returns the command list and an environment with the correct DeepSpeed environment variables.

    Depending on the number of machines and the selected multi-node launcher, this either builds a `deepspeed` CLI
    command, or leaves the command as `None` and instead writes launch attributes (`nproc_per_node`, `nnodes`,
    `node_rank`, `master_addr`/`master_port` or `rdzv_endpoint`) onto `args`.

    Args:
        args (`argparse.Namespace`):
            Parsed launch arguments. This object is mutated: the main process port is resolved, a default multi-node
            launcher is filled in, and launch attributes may be added.

    Returns:
        `tuple`: `(cmd, current_env)` where `cmd` is the `deepspeed` command as a list of strings, or `None` when no
        `deepspeed` CLI command is built, and `current_env` is a copy of `os.environ` extended with the Accelerate /
        DeepSpeed variables.

    Raises:
        ValueError: If `--module` and `--no_python` are both set, if the `nossh` launcher is requested with
            DeepSpeed < 0.14.5, or if `args.mixed_precision` is not a valid `PrecisionType`.
        ConnectionError: If the main process port is already in use in a multi-machine launch (checked on machine
            rank 0 only).
        RuntimeError: If `fp8` mixed precision is requested but `is_fp8_available()` is false.
    """
    # get free port and update configurations
    if args.main_process_port == 0:
        args.main_process_port = get_free_port()

    elif args.main_process_port is None:
        # no port given: fall back to 29500
        args.main_process_port = 29500

    num_processes = args.num_processes
    num_machines = args.num_machines
    main_process_ip = args.main_process_ip
    main_process_port = args.main_process_port
    # stays None unless the `deepspeed` CLI branch below builds a command
    cmd = None

    # make sure launcher is not None
    if args.deepspeed_multinode_launcher is None:
        # set to default pdsh
        args.deepspeed_multinode_launcher = DEEPSPEED_MULTINODE_LAUNCHERS[0]

    # Multi-node with any launcher other than DEEPSPEED_MULTINODE_LAUNCHERS[1]: build a `deepspeed` CLI command.
    if num_machines > 1 and args.deepspeed_multinode_launcher != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
        cmd = ["deepspeed"]
        cmd.extend(["--hostfile", str(args.deepspeed_hostfile)])
        if args.deepspeed_multinode_launcher == "nossh":
            if compare_versions("deepspeed", "<", "0.14.5"):
                raise ValueError("nossh launcher requires DeepSpeed >= 0.14.5")
            cmd.extend(["--node_rank", str(args.machine_rank), "--no_ssh"])
        else:
            cmd.extend(["--no_local_rank", "--launcher", str(args.deepspeed_multinode_launcher)])
        # Resource selection: an exclusion filter wins over an inclusion filter; with neither, pass the per-machine
        # process count as `--num_gpus`.
        if args.deepspeed_exclusion_filter is not None:
            cmd.extend(
                [
                    "--exclude",
                    str(args.deepspeed_exclusion_filter),
                ]
            )
        elif args.deepspeed_inclusion_filter is not None:
            cmd.extend(
                [
                    "--include",
                    str(args.deepspeed_inclusion_filter),
                ]
            )
        else:
            cmd.extend(["--num_gpus", str(args.num_processes // args.num_machines)])
        if main_process_ip:
            cmd.extend(["--master_addr", str(main_process_ip)])
        cmd.extend(["--master_port", str(main_process_port)])
        if args.module and args.no_python:
            raise ValueError("--module and --no_python cannot be used together")
        elif args.module:
            cmd.append("--module")
        elif args.no_python:
            cmd.append("--no_python")
        # the training script and its own arguments always come last
        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)
    # Multi-node with DEEPSPEED_MULTINODE_LAUNCHERS[1]: no command is built, launch attributes go on `args` instead.
    elif num_machines > 1 and args.deepspeed_multinode_launcher == DEEPSPEED_MULTINODE_LAUNCHERS[1]:
        args.nproc_per_node = str(num_processes // num_machines)
        args.nnodes = str(num_machines)
        args.node_rank = int(args.machine_rank)
        if getattr(args, "same_network", False):
            args.master_addr = str(main_process_ip)
            args.master_port = str(main_process_port)
        else:
            args.rdzv_endpoint = f"{main_process_ip}:{main_process_port}"
    # Single machine: only the process count and master port are set on `args`.
    else:
        args.nproc_per_node = str(num_processes)
        if main_process_port is not None:
            args.master_port = str(main_process_port)

    # only need to check port availability in main process, in case we have to start multiple launchers on the same machine
    # for some reasons like splitting log files.
    need_port_check = num_machines <= 1 or int(args.machine_rank) == 0
    if need_port_check and is_port_in_use(main_process_port):
        if num_machines <= 1:
            # single machine: warn and flag a standalone launch instead of failing
            args.standalone = True
            warnings.warn(
                f"Port `{main_process_port}` is already in use. "
                "Accelerate will attempt to launch in a standalone-like mode by finding an open port automatically for this session. "
                "If this current attempt fails, or for more control in future runs, please specify a different port "
                "(e.g., `--main_process_port <your_chosen_port>`) or use `--main_process_port 0` for automatic selection "
                "in your launch command or Accelerate config file."
            )
        else:
            raise ConnectionError(
                f"Tried to launch distributed communication on port `{main_process_port}`, but another process is utilizing it. "
                "Please specify a different port (such as using the `--main_process_port` flag or specifying a different `main_process_port` in your config file)"
                " and rerun your script. To automatically use the next open port (on a single node), you can set this to `0`."
            )

    # Validate the flag combination for every launch path, and store truthy flags as the literal `True`.
    if args.module and args.no_python:
        raise ValueError("--module and --no_python cannot be used together")
    elif args.module:
        args.module = True
    elif args.no_python:
        args.no_python = True

    # Work on a copy so the launcher's own process environment is left untouched.
    current_env = os.environ.copy()
    if args.debug:
        current_env["ACCELERATE_DEBUG_MODE"] = "true"
    gpu_ids = getattr(args, "gpu_ids", "all")
    # Restrict visible devices through the variable of the first backend reported as available, defaulting to CUDA.
    if gpu_ids != "all" and args.gpu_ids is not None:
        if is_xpu_available():
            current_env["ZE_AFFINITY_MASK"] = gpu_ids
        elif is_mlu_available():
            current_env["MLU_VISIBLE_DEVICES"] = gpu_ids
        elif is_sdaa_available():
            current_env["SDAA_VISIBLE_DEVICES"] = gpu_ids
        elif is_musa_available():
            current_env["MUSA_VISIBLE_DEVICES"] = gpu_ids
        elif is_npu_available():
            current_env["ASCEND_RT_VISIBLE_DEVICES"] = gpu_ids
        elif is_hpu_available():
            current_env["HABANA_VISIBLE_MODULES"] = gpu_ids
        elif is_neuron_available():
            current_env["NEURON_RT_VISIBLE_CORES"] = gpu_ids
        else:
            current_env["CUDA_VISIBLE_DEVICES"] = gpu_ids
    try:
        mixed_precision = PrecisionType(args.mixed_precision.lower())
    except ValueError:
        raise ValueError(
            f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
        )

    # Append the current working directory to PYTHONPATH for the launched processes.
    current_env["PYTHONPATH"] = env_var_path_add("PYTHONPATH", os.path.abspath("."))
    current_env["ACCELERATE_MIXED_PRECISION"] = str(mixed_precision)
    if args.mixed_precision.lower() == "fp8":
        if not is_fp8_available():
            raise RuntimeError(
                "FP8 is not available on this machine. Please ensure that either Transformer Engine, MSAMP or torchao is installed."
            )
        current_env = setup_fp8_env(args, current_env)
    current_env["ACCELERATE_CONFIG_DS_FIELDS"] = str(args.deepspeed_fields_from_accelerate_config).lower()
    current_env["ACCELERATE_USE_DEEPSPEED"] = "true"
    # Optional DeepSpeed settings: each one is exported only when it was explicitly provided.
    if args.zero_stage is not None:
        current_env["ACCELERATE_DEEPSPEED_ZERO_STAGE"] = str(args.zero_stage)
    if args.gradient_accumulation_steps is not None:
        current_env["ACCELERATE_GRADIENT_ACCUMULATION_STEPS"] = str(args.gradient_accumulation_steps)
    if args.gradient_clipping is not None:
        current_env["ACCELERATE_GRADIENT_CLIPPING"] = str(args.gradient_clipping).lower()
    if args.offload_optimizer_device is not None:
        current_env["ACCELERATE_DEEPSPEED_OFFLOAD_OPTIMIZER_DEVICE"] = str(args.offload_optimizer_device).lower()
    if args.offload_param_device is not None:
        current_env["ACCELERATE_DEEPSPEED_OFFLOAD_PARAM_DEVICE"] = str(args.offload_param_device).lower()
    if args.zero3_init_flag is not None:
        current_env["ACCELERATE_DEEPSPEED_ZERO3_INIT"] = str(args.zero3_init_flag).lower()
    if args.zero3_save_16bit_model is not None:
        current_env["ACCELERATE_DEEPSPEED_ZERO3_SAVE_16BIT_MODEL"] = str(args.zero3_save_16bit_model).lower()
    if args.deepspeed_config_file is not None:
        current_env["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = str(args.deepspeed_config_file)
    if args.enable_cpu_affinity:
        current_env["ACCELERATE_CPU_AFFINITY"] = "1"
    if args.deepspeed_moe_layer_cls_names is not None:
        current_env["ACCELERATE_DEEPSPEED_MOE_LAYER_CLS_NAMES"] = str(args.deepspeed_moe_layer_cls_names)

    if args.use_parallelism_config:
        current_env = prepare_extend_env_parallelism_config(args, current_env)

    return cmd, current_env


def prepare_tpu(
    args: argparse.Namespace, current_env: dict[str, str], pod: bool = False
) -> tuple[argparse.Namespace, dict[str, str]]:
    """
    Fills `current_env` with the TPU-related environment variables and, for pod launches, mirrors the explicit TPU
    arguments onto the attribute names expected by XLA.

    Args:
        args (`argparse.Namespace`):
            Parsed launch arguments. Mutated in place when `pod` is `True`.
        current_env (`dict[str, str]`):
            Environment mapping to extend in place.
        pod (`bool`, *optional*, defaults to `False`):
            Whether this is a pod launch, in which case `args.vm` and `args.tpu` are set.

    Returns:
        `tuple`: The same `args` and `current_env` objects that were passed in.
    """
    bf16_on_tpu = args.mixed_precision == "bf16" and is_torch_xla_available(check_is_tpu=True)
    if bf16_on_tpu:
        # Exactly one of the two XLA bf16 switches is turned on.
        xla_bf16_flag = "XLA_DOWNCAST_BF16" if args.downcast_bf16 else "XLA_USE_BF16"
        current_env[xla_bf16_flag] = "1"

    if args.debug:
        current_env["ACCELERATE_DEBUG_MODE"] = "true"

    if not pod:
        return args, current_env

    # Take explicit args and set them up for XLA
    args.vm = args.tpu_vm
    args.tpu = args.tpu_name
    return args, current_env


def _convert_nargs_to_dict(nargs: list[str]) -> dict[str, str]:
    if len(nargs) < 0:
        return {}
    # helper function to infer type for argsparser

    def _infer_type(s):
        try:
            s = float(s)

            if s // 1 == s:
                return int(s)
            return s
        except ValueError:
            return s

    parser = argparse.ArgumentParser()
    _, unknown = parser.parse_known_args(nargs)
    for index, argument in enumerate(unknown):
        if argument.startswith(("-", "--")):
            action = None
            if index + 1 < len(unknown):  # checks if next index would be in list
                if unknown[index + 1].startswith(("-", "--")):  # checks if next element is an key
                    # raise an error if element is store_true or store_false
                    raise ValueError(
                        "SageMaker doesn’t support argparse actions for `store_true` or `store_false`. Please define explicit types"
                    )
            else:  # raise an error if last element is store_true or store_false
                raise ValueError(
                    "SageMaker doesn’t support argparse actions for `store_true` or `store_false`. Please define explicit types"
                )
            # adds argument to parser based on action_store true
            if action is None:
                parser.add_argument(argument, type=_infer_type)
            else:
                parser.add_argument(argument, action=action)

    return {
        key: (literal_eval(value) if value in ("True", "False") else value)
        for key, value in parser.parse_args(nargs).__dict__.items()
    }


def prepare_sagemager_args_inputs(
    sagemaker_config: SageMakerConfig, args: argparse.Namespace
) -> tuple[argparse.Namespace, dict[str, Any]]:
    """
    Builds the estimator keyword arguments and the input channels for an Amazon SageMaker training job.

    AWS region and credentials are written to `os.environ` as a side effect.

    Args:
        sagemaker_config (`SageMakerConfig`):
            The SageMaker configuration (region, credentials profile, image, instance settings, optional inputs and
            metrics files, optional additional estimator arguments).
        args (`argparse.Namespace`):
            Parsed launch arguments (training script and its arguments, mixed precision, dynamo settings, AWS keys).

    Returns:
        `tuple`: `(estimator_kwargs, sagemaker_inputs)`. The first item is a dictionary of estimator keyword
        arguments, the second is a dictionary of inputs read from the inputs file, or `None` when no file is
        configured.

    Raises:
        OSError: If neither an AWS profile nor both AWS access keys are available.
        ValueError: If the training script is not a `.py` file, or the mixed precision / dynamo backend is unknown.
        RuntimeError: If `fp8` mixed precision is requested but `is_fp8_available()` is false.
    """

    def _read_tsv_pairs(path):
        # Skip the header row, then return (first column, stripped second column) for every remaining line.
        pairs = []
        with open(path) as handle:
            for line_no, raw_line in enumerate(handle):
                if line_no > 0:
                    columns = raw_line.split("\t")
                    pairs.append((columns[0], columns[1].strip()))
        return pairs

    # configure environment
    print("Configuring Amazon SageMaker environment")
    os.environ["AWS_DEFAULT_REGION"] = sagemaker_config.region

    # configure credentials: a named profile takes precedence over explicit keys
    if sagemaker_config.profile is not None:
        os.environ["AWS_PROFILE"] = sagemaker_config.profile
    elif args.aws_access_key_id is None or args.aws_secret_access_key is None:
        raise OSError("You need to provide an aws_access_key_id and aws_secret_access_key when not using aws_profile")
    else:
        os.environ["AWS_ACCESS_KEY_ID"] = args.aws_access_key_id
        os.environ["AWS_SECRET_ACCESS_KEY"] = args.aws_secret_access_key

    # extract needed arguments; an empty directory part means the current directory
    source_dir = os.path.dirname(args.training_script) or "."
    entry_point = os.path.basename(args.training_script)
    if not entry_point.endswith(".py"):
        raise ValueError(f'Your training script should be a python script and not "{entry_point}"')

    print("Converting Arguments to Hyperparameters")
    hyperparameters = _convert_nargs_to_dict(args.training_script_args)

    precision_name = args.mixed_precision.lower()
    try:
        mixed_precision = PrecisionType(precision_name)
    except ValueError:
        raise ValueError(f"Unknown mixed_precision mode: {precision_name}. Choose between {PrecisionType.list()}.")

    backend_name = args.dynamo_backend.upper()
    try:
        dynamo_backend = DynamoBackend(backend_name)
    except ValueError:
        raise ValueError(f"Unknown dynamo backend: {backend_name}. Choose between {DynamoBackend.list()}.")

    # Environment variables to be set for use during training job
    environment = {
        "ACCELERATE_USE_SAGEMAKER": "true",
        "ACCELERATE_MIXED_PRECISION": str(mixed_precision),
        "ACCELERATE_DYNAMO_BACKEND": dynamo_backend.value,
        "ACCELERATE_DYNAMO_MODE": args.dynamo_mode,
        "ACCELERATE_DYNAMO_USE_FULLGRAPH": str(args.dynamo_use_fullgraph),
        "ACCELERATE_DYNAMO_USE_DYNAMIC": str(args.dynamo_use_dynamic),
        "ACCELERATE_DYNAMO_USE_REGIONAL_COMPILATION": str(args.dynamo_use_regional_compilation),
        "ACCELERATE_SAGEMAKER_DISTRIBUTED_TYPE": sagemaker_config.distributed_type.value,
    }
    if precision_name == "fp8":
        if not is_fp8_available():
            raise RuntimeError(
                "FP8 is not available on this machine. Please ensure that either Transformer Engine, MSAMP or torchao is installed."
            )
        environment = setup_fp8_env(args, environment)

    # configure distribution set up
    if sagemaker_config.distributed_type == SageMakerDistributedType.DATA_PARALLEL:
        distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
    else:
        distribution = None

    # configure sagemaker inputs
    sagemaker_inputs = None
    inputs_file = sagemaker_config.sagemaker_inputs_file
    if inputs_file is not None:
        print(f"Loading SageMaker Inputs from {inputs_file} file")
        sagemaker_inputs = dict(_read_tsv_pairs(inputs_file))
        print(f"Loaded SageMaker Inputs: {sagemaker_inputs}")

    # configure sagemaker metrics
    sagemaker_metrics = None
    metrics_file = sagemaker_config.sagemaker_metrics_file
    if metrics_file is not None:
        print(f"Loading SageMaker Metrics from {metrics_file} file")
        sagemaker_metrics = [{"Name": name, "Regex": regex} for name, regex in _read_tsv_pairs(metrics_file)]
        print(f"Loaded SageMaker Metrics: {sagemaker_metrics}")

    # configure session
    print("Creating Estimator")
    estimator_kwargs = {
        "image_uri": sagemaker_config.image_uri,
        "entry_point": entry_point,
        "source_dir": source_dir,
        "role": sagemaker_config.iam_role_name,
        "transformers_version": sagemaker_config.transformers_version,
        "pytorch_version": sagemaker_config.pytorch_version,
        "py_version": sagemaker_config.py_version,
        "base_job_name": sagemaker_config.base_job_name,
        "instance_count": sagemaker_config.num_machines,
        "instance_type": sagemaker_config.ec2_instance_type,
        "debugger_hook_config": False,
        "distribution": distribution,
        "hyperparameters": hyperparameters,
        "environment": environment,
        "metric_definitions": sagemaker_metrics,
    }

    extra_estimator_args = sagemaker_config.additional_args
    if extra_estimator_args is not None:
        estimator_kwargs = merge_dicts(extra_estimator_args, estimator_kwargs)
    return estimator_kwargs, sagemaker_inputs


def env_var_path_add(env_var_name, path_to_add):
    """
    Extends a path-based environment variable's value with a new path and returns the updated value. It's up to the
    caller to set it in os.environ.

    Entries are split and joined with `os.pathsep`, so the result is valid on every platform (`:` on POSIX, `;` on
    Windows, where splitting on `:` would also break drive letters such as `C:\\...`).

    Args:
        env_var_name (`str`):
            Name of the environment variable to read, e.g. `"PYTHONPATH"`. A missing variable is treated as empty.
        path_to_add (`str` or `os.PathLike`):
            The path to append. It is converted with `str()`.

    Returns:
        `str`: The existing non-empty entries followed by `path_to_add`, joined with `os.pathsep`.
    """
    # Drop empty entries so an unset or empty variable does not produce a leading separator.
    paths = [entry for entry in os.environ.get(env_var_name, "").split(os.pathsep) if entry]
    paths.append(str(path_to_add))
    return os.pathsep.join(paths)


class PrepareForLaunch:
    """
    Wraps a function so it can be launched once per process in a distributed setup.

    Calling the wrapper with a process index first prepares the process (process group in debug mode, rank
    environment variables for the multi-device distributed types) and then runs the wrapped function.

    Args:
        launcher (`Callable`):
            The function to launch.
        distributed_type ([`~state.DistributedType`]):
            The distributed type to prepare for.
        debug (`bool`, *optional*, defaults to `False`):
            Whether or not this is a debug launch.
    """

    def __init__(self, launcher, distributed_type="NO", debug=False):
        self.launcher = launcher
        self.distributed_type = DistributedType(distributed_type)
        self.debug = debug

    def __call__(self, index, *args):
        if self.debug:
            # Debug launches rendezvous through a file store on the "gloo" backend.
            num_workers = int(os.environ.get("WORLD_SIZE"))
            rendezvous_file = os.environ.get("ACCELERATE_DEBUG_RDV_FILE")
            torch.distributed.init_process_group(
                "gloo",
                rank=index,
                store=torch.distributed.FileStore(rendezvous_file, num_workers),
                world_size=num_workers,
            )
        elif self.distributed_type in (
            DistributedType.MULTI_GPU,
            DistributedType.MULTI_MLU,
            DistributedType.MULTI_MUSA,
            DistributedType.MULTI_NPU,
            DistributedType.MULTI_XPU,
            DistributedType.MULTI_CPU,
            DistributedType.MULTI_NEURON,
        ):
            # Prepare the environment for torch.distributed
            os.environ["LOCAL_RANK"] = str(index)
            procs_per_node = int(os.environ.get("NPROC", 1))
            node_index = int(os.environ.get("NODE_RANK", 0))
            # Global rank = processes on all preceding nodes + local index.
            global_rank = procs_per_node * node_index + index
            os.environ["RANK"] = str(global_rank)

        os.environ["FORK_LAUNCHED"] = "1"
        self.launcher(*args)


================================================
FILE: src/accelerate/utils/megatron_lm.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import math
import os
from abc import ABC
from functools import partial

import torch
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ..optimizer import AcceleratedOptimizer
from ..scheduler import AcceleratedScheduler
from .imports import is_megatron_lm_available
from .operations import recursively_apply, send_to_device


if is_megatron_lm_available():
    from megatron.core import mpu, tensor_parallel
    from megatron.core.distributed import DistributedDataParallel as LocalDDP
    from megatron.core.distributed import finalize_model_grads
    from megatron.core.enums import ModelType
    from megatron.core.num_microbatches_calculator import get_num_microbatches
    from megatron.core.optimizer import get_megatron_optimizer
    from megatron.core.parallel_state import get_tensor_model_parallel_group, get_tensor_model_parallel_src_rank
    from megatron.core.pipeline_parallel import get_forward_backward_func
    from megatron.core.utils import get_model_config
    from megatron.legacy.data.dataset_utils import build_train_valid_test_datasets
    from megatron.legacy.model import BertModel, T5Model
    from megatron.legacy.model.classification import Classification
    from megatron.training import (
        get_args,
        get_tensorboard_writer,
        get_tokenizer,
        print_rank_last,
    )
    from megatron.training.arguments import (
        _add_data_args,
        _add_validation_args,
        core_transformer_config_from_args,
        parse_args,
        validate_args,
    )
    from megatron.training.checkpointing import load_args_from_checkpoint, load_checkpoint, save_checkpoint
    from megatron.training.global_vars import set_global_variables
    from megatron.training.gpt_builders import gpt_builder
    from megatron.training.initialize import (
        _compile_dependencies,
        _init_autoresume,
        _initialize_distributed,
        _set_random_seed,
        set_jit_fusion_options,
        write_args_to_tensorboard,
    )
    from megatron.training.tokenizer.tokenizer import _vocab_size_with_padding
    from megatron.training.training import (
        build_train_valid_test_data_iterators,
        get_optimizer_param_scheduler,
        num_floating_point_operations,
        setup_model_and_optimizer,
        train_step,
        training_log,
    )
    from megatron.training.utils import (
        average_losses_across_data_parallel_group,
        calc_params_l2_norm,
        get_ltor_masks_and_position_ids,
    )


# model utilities
def model_provider_func(pre_process=True, post_process=True, add_encoder=True, add_decoder=True):
    """
    Build the model selected by the Megatron argument `model_type_name`.

    Supported values are `"bert"` (a `BertModel` when `pretraining_flag` is set, otherwise a `Classification`
    model), `"gpt"` (built through `gpt_builder`) and `"t5"` (a `T5Model`).

    Args:
        pre_process (`bool`, *optional*, defaults to `True`): Forwarded to the model constructor / builder.
        post_process (`bool`, *optional*, defaults to `True`): Forwarded to the model constructor / builder.
        add_encoder (`bool`, *optional*, defaults to `True`): Forwarded to `T5Model` only.
        add_decoder (`bool`, *optional*, defaults to `True`): Forwarded to `T5Model` only.

    Returns:
        The constructed model.

    Raises:
        ValueError: If `model_type_name` is not one of the supported values.
    """
    args = get_args()
    is_pretraining = args.pretraining_flag
    mode = "pre-training" if is_pretraining else "fine-tuning"
    # Only rank 0 announces what is being built.
    if args.rank == 0:
        print(f"Building {args.model_type_name} model in the {mode} mode.")
        print(
            "The Megatron LM model weights are initialized at random in `accelerator.prepare`. "
            "Please use `accelerator.load_checkpoint` to load a pre-trained checkpoint matching the distributed setup."
        )
    config = core_transformer_config_from_args(args)
    model_kind = args.model_type_name

    if model_kind == "bert":
        if is_pretraining:
            return BertModel(
                config=config,
                num_tokentypes=2 if args.bert_binary_head else 0,
                add_binary_head=args.bert_binary_head,
                parallel_output=True,
                pre_process=pre_process,
                post_process=post_process,
            )
        return Classification(
            config=config,
            num_classes=args.num_labels,
            num_tokentypes=2,
            pre_process=pre_process,
            post_process=post_process,
        )

    if model_kind == "gpt":
        # use the latest gpt builder to build the model and set use_legacy_models to False
        args.use_legacy_models = False
        return gpt_builder(args, pre_process, post_process, vp_stage=None, config=None)

    if model_kind == "t5":
        return T5Model(
            config=config,
            num_tokentypes=0,
            parallel_output=True,
            pre_process=pre_process,
            post_process=post_process,
            add_encoder=add_encoder,
            add_decoder=add_decoder,
        )

    raise ValueError(f"Unsupported model type: {args.model_type_name}")


def prepare_model_optimizer_scheduler(accelerator):
    """
    Creates the Megatron model, optimizer and scheduler for `accelerator`.

    When the Megatron-LM plugin defines a `custom_prepare_model_function`, the model comes from that function (which
    requires a `custom_model_provider_function` as well) and the optimizer and scheduler are prepared separately.
    Otherwise all three come from `setup_model_and_optimizer`, using the plugin's custom model provider if there is
    one and `model_provider_func` if not.

    The length of the returned model is stored on the Megatron arguments as `model_len`.

    Returns:
        `tuple`: `(model, optimizer, scheduler)`.

    Raises:
        ValueError: If a custom prepare function is set without a custom model provider function.
    """
    accelerator.print("Preparing model optimizer scheduler")
    args = get_args()
    plugin = accelerator.state.megatron_lm_plugin
    custom_prepare = plugin.custom_prepare_model_function

    if custom_prepare is None:
        # T5 is the only encoder-and-decoder model type here; everything else is encoder-or-decoder.
        if args.model_type_name == "t5":
            model_type = ModelType.encoder_and_decoder
        else:
            model_type = ModelType.encoder_or_decoder
        custom_provider = plugin.custom_model_provider_function
        provider = model_provider_func if custom_provider is None else custom_provider
        model, optimizer, scheduler = setup_model_and_optimizer(provider, model_type)
    else:
        custom_provider = plugin.custom_model_provider_function
        if custom_provider is None:
            raise ValueError(
                "You must provide a `custom_model_provider_function` when using a `custom_prepare_model_function`."
            )
        model = custom_prepare(custom_provider)
        optimizer = prepare_optimizer(accelerator, model)
        scheduler = prepare_scheduler(accelerator, optimizer, scheduler=None)

    args.model_len = len(model)
    return model, optimizer, scheduler


# dataloader utilities
class MegatronLMDummyDataLoader:
    """
    Placeholder dataloader that carries Megatron dataset arguments instead of actual data.

    On construction it collects the Megatron data and validation arguments (parsed from the command line with their
    defaults), overrides them with `dataset_kwargs`, and sets `megatron_dataset_flag` to `True`. The helper methods
    push these arguments into the global Megatron args and build the train/valid/test data iterators.

    Args:
        **dataset_kwargs: Megatron data arguments.
    """

    def __init__(self, **dataset_kwargs):
        parser = _add_validation_args(_add_data_args(argparse.ArgumentParser()))
        # Unknown command line tokens are ignored; only the recognised data/validation arguments are kept.
        known_args, _ = parser.parse_known_args()
        self.dataset_args = vars(known_args)
        self.dataset_args.update(dataset_kwargs)
        self.dataset_args["megatron_dataset_flag"] = True

    def set_megatron_data_args(self):
        """Copies every stored dataset argument onto the global Megatron args, warning about changed values."""
        megatron_args = get_args()
        for name, new_value in self.dataset_args.items():
            previous = getattr(megatron_args, name, "")
            if previous != new_value:
                print(
                    f"WARNING: MegatronLMDummyDataLoader overriding arguments for {name}:{previous} with {name}:{new_value}"
                )
            setattr(megatron_args, name, new_value)

    def get_train_valid_test_datasets_provider(self, accelerator):
        """
        Returns the function used to build the train, valid and test datasets.

        Preference order: the plugin's custom provider, then the provider of the matching Megatron `pretrain_*`
        script (marked as distributed) if it can be imported, then a built-in fallback based on
        `build_train_valid_test_datasets`.
        """
        custom_provider = accelerator.state.megatron_lm_plugin.custom_megatron_datasets_provider_function
        if custom_provider is not None:
            return custom_provider

        try:
            args = get_args()
            model_kind = args.model_type_name
            megatron_provider = None
            # Use '--no-use-pep517 -e' to pip install nvidia's megatron from source
            if model_kind == "bert":
                from pretrain_bert import train_valid_test_datasets_provider as megatron_provider
            elif model_kind == "gpt":
                from pretrain_gpt import train_valid_test_datasets_provider as megatron_provider
            elif model_kind == "t5":
                from pretrain_t5 import train_valid_test_datasets_provider as megatron_provider

            if megatron_provider is not None:
                megatron_provider.is_distributed = True
                return megatron_provider
        except ImportError:
            # The pretrain scripts are not importable: fall through to the built-in provider below.
            pass

        def train_valid_test_datasets_provider(train_val_test_num_samples):
            """Build train, valid, and test datasets."""
            args = get_args()
            prefixes = args.data_path
            if not isinstance(prefixes, (list, tuple)):
                prefixes = [prefixes]
            dataset_args = {
                "data_prefix": prefixes,
                "splits_string": args.split,
                "train_valid_test_num_samples": train_val_test_num_samples,
                "seed": args.seed,
            }
            model_kind = args.model_type_name
            if model_kind == "bert":
                dataset_args["max_seq_length"] = args.seq_length
                dataset_args["binary_head"] = args.bert_binary_head
            elif model_kind == "gpt":
                dataset_args["max_seq_length"] = args.seq_length
            elif model_kind == "t5":
                dataset_args["max_seq_length"] = args.encoder_seq_length
                dataset_args["max_seq_length_dec"] = args.decoder_seq_length
                dataset_args["dataset_type"] = "t5"
            else:
                raise ValueError(f"Unsupported model type: {args.model_type_name}")
            train_ds, valid_ds, test_ds = build_train_valid_test_datasets(**dataset_args)
            return train_ds, valid_ds, test_ds

        return train_valid_test_datasets_provider

    def build_train_valid_test_data_iterators(self, accelerator):
        """
        Builds the train, valid and test data iterators.

        With virtual pipeline parallelism enabled, each returned item is a list with one iterator per model chunk
        (`args.model_len` entries); otherwise each item is a single iterator.
        """
        args = get_args()
        datasets_provider = self.get_train_valid_test_datasets_provider(accelerator)

        if args.virtual_pipeline_model_parallel_size is None:
            train_data_iterator, valid_data_iterator, test_data_iterator = build_train_valid_test_data_iterators(
                datasets_provider
            )
            return train_data_iterator, valid_data_iterator, test_data_iterator

        train_data_iterator, valid_data_iterator, test_data_iterator = [], [], []
        for chunk_index in range(getattr(args, "model_len", 0)):
            # Select the virtual pipeline rank before building that chunk's iterators.
            mpu.set_virtual_pipeline_model_parallel_rank(chunk_index)
            chunk_iterators = build_train_valid_test_data_iterators(datasets_provider)
            train_data_iterator.append(chunk_iterators[0])
            valid_data_iterator.append(chunk_iterators[1])
            test_data_iterator.append(chunk_iterators[2])
        return train_data_iterator, valid_data_iterator, test_data_iterator


def _handle_megatron_data_iterator(accelerator, data_iterator):
    """
    Returns `data_iterator`, or a placeholder when this rank has none but the tensor-parallel source rank does.

    Whether the iterator is `None` is broadcast from the tensor-model-parallel source rank to its group. A rank
    whose own iterator is `None` while the source rank's is not receives an endless iterator of empty dicts;
    in every other case the original `data_iterator` is returned unchanged.
    """

    class DummyMegatronDataloader:
        # Endless iterator that yields an empty batch on every step.
        def __iter__(self):
            return self

        def __next__(self):
            return {}

    local_iterator_missing = data_iterator is None
    # After the broadcast this flag holds the source rank's answer, not the local one.
    src_iterator_missing = torch.tensor(local_iterator_missing, dtype=torch.bool, device=accelerator.device)
    torch.distributed.broadcast(
        src_iterator_missing,
        get_tensor_model_parallel_src_rank(),
        group=get_tensor_model_parallel_group(),
    )
    if not src_iterator_missing and local_iterator_missing:
        return DummyMegatronDataloader()
    return data_iterator


def prepare_data_loader(accelerator, dataloader):
    """
    Prepare a dataloader, or Megatron dataset iterators, for Megatron-LM training.

    When `args.megatron_dataset_flag` is falsy, `dataloader` is rebuilt as a `torch.utils.data.DataLoader` whose batch
    size is `args.micro_batch_size * args.num_micro_batches` and is then handed to the generic `prepare_data_loader`
    from `..data_loader`, sharded by Megatron's data-parallel world size and rank. Otherwise `dataloader` must expose
    `build_train_valid_test_data_iterators(accelerator)`, which is used to build the three iterators.

    Args:
        accelerator: Object providing `print`, `device`, `rng_types` and `dispatch_batches`.
        dataloader: Either a dataloader-like object (its `dataset` attribute and the attributes named in
            `_PYTORCH_DATALOADER_KWARGS` are read), or an object with `build_train_valid_test_data_iterators`.

    Returns:
        The prepared dataloader, or a `(train, valid, test)` tuple of data iterators.
    """
    accelerator.print("Preparing dataloader")
    args = get_args()
    if not args.megatron_dataset_flag:
        # NOTE: this local import shadows the enclosing function's own name inside this branch.
        from ..data_loader import _PYTORCH_DATALOADER_KWARGS, prepare_data_loader

        # One fetched batch covers all micro-batches of a step.
        micro_batch_size = args.micro_batch_size * args.num_micro_batches
        # Copy every known DataLoader kwarg from the existing dataloader, falling back to the listed default.
        kwargs = {k: getattr(dataloader, k, _PYTORCH_DATALOADER_KWARGS[k]) for k in _PYTORCH_DATALOADER_KWARGS}
        if kwargs["batch_size"] is None:
            # Batching is delegated to a batch sampler; patch its batch size in place.
            if isinstance(kwargs["sampler"], torch.utils.data.BatchSampler):
                kwargs["sampler"].batch_size = micro_batch_size
            else:
                del kwargs["sampler"]
                del kwargs["shuffle"]
                del kwargs["batch_size"]
                kwargs["batch_sampler"].batch_size = micro_batch_size
        else:
            del kwargs["batch_sampler"]
            kwargs["batch_size"] = micro_batch_size

        dataloader = torch.utils.data.DataLoader(dataloader.dataset, **kwargs)
        # split_batches:
        # Megatron only needs to fetch different data between different dp groups,
        # and does not need to split the data within the dp group.
        return prepare_data_loader(
            dataloader,
            accelerator.device,
            num_processes=mpu.get_data_parallel_world_size(),
            process_index=mpu.get_data_parallel_rank(),
            split_batches=False,
            put_on_device=True,
            rng_types=accelerator.rng_types.copy(),
            dispatch_batches=accelerator.dispatch_batches,
        )
    else:
        # Seed the consumed-sample counters, either from a (train, valid, test) triple or from zero.
        if args.consumed_samples is not None:
            (
                args.consumed_train_samples,
                args.consumed_valid_samples,
                args.consumed_test_samples,
            ) = args.consumed_samples
        else:
            args.consumed_train_samples, args.consumed_valid_samples, args.consumed_test_samples = 0, 0, 0
        # Temporarily enlarge the global micro batch size while the iterators are built; restored right after.
        args.micro_batch_size = args.micro_batch_size * args.num_micro_batches
        # In order to be compatible with data in transform format,
        # it needs to increase the size of mbs first,
        # and then split the large batch data into some mbs.
        (
            train_data_iterator,
            valid_data_iterator,
            test_data_iterator,
        ) = dataloader.build_train_valid_test_data_iterators(accelerator)
        args.micro_batch_size = args.micro_batch_size // args.num_micro_batches

        # Replace missing iterators with placeholders where the tensor-parallel source rank has real data.
        train_data_iterator = _handle_megatron_data_iterator(
            accelerator=accelerator, data_iterator=train_data_iterator
        )
        valid_data_iterator = _handle_megatron_data_iterator(
            accelerator=accelerator, data_iterator=valid_data_iterator
        )
        test_data_iterator = _handle_megatron_data_iterator(accelerator=accelerator, data_iterator=test_data_iterator)

        return train_data_iterator, valid_data_iterator, test_data_iterator


# optimizer utilities
class MegatronLMOptimizerWrapper(AcceleratedOptimizer):
    """
    Optimizer wrapper whose `zero_grad` and `step` are no-ops.

    The wrapped optimizer is registered without device placement and without a gradient scaler.
    """

    def __init__(self, optimizer):
        super().__init__(optimizer, device_placement=False, scaler=None)

    def zero_grad(self, set_to_none=None):
        """Do nothing: `model(**batch)` is doing that automatically, so no implementation is needed."""
        return None

    def step(self):
        """Do nothing: `model(**batch)` is doing that automatically, so no implementation is needed."""
        return None

    @property
    def step_was_skipped(self):
        """Whether or not the optimizer step was done, or skipped because of gradient overflow."""
        wrapped_optimizer = self.optimizer
        return wrapped_optimizer.skipped_iter


def prepare_optimizer(accelerator, model):
    """
    Build the Megatron optimizer for `model`.

    The weight-decay condition, learning-rate scaling condition and learning-rate multiplier are read from the
    global Megatron arguments and forwarded to `get_megatron_optimizer`.
    """
    accelerator.print("Preparing optimizer")
    megatron_args = get_args()
    optimizer = get_megatron_optimizer(
        model,
        megatron_args.no_wd_decay_cond,
        megatron_args.scale_lr_cond,
        megatron_args.lr_mult,
    )
    return optimizer


# scheduler utilities
class MegatronLMDummyScheduler:
    """
    Placeholder scheduler that only records its constructor arguments.

    It performs no scheduling itself; it simply stores the optimizer, the step counts and any extra keyword
    arguments so that a conventional training loop can still pass a scheduler object around.

    Args:
        optimizer (`torch.optim.optimizer.Optimizer`):
            The optimizer to wrap.
        total_num_steps (int):
            Total number of steps.
        warmup_num_steps (int):
            Number of steps for warmup.
        **kwargs (additional keyword arguments, *optional*):
            Other arguments.
    """

    def __init__(self, optimizer, total_num_steps=None, warmup_num_steps=0, **kwargs):
        self.optimizer = optimizer
        # Step counts are stored verbatim; no validation is applied here.
        self.total_num_steps, self.warmup_num_steps = total_num_steps, warmup_num_steps
        self.kwargs = kwargs


class MegatronLMSchedulerWrapper(AcceleratedScheduler):
    """Scheduler wrapper whose `step` is a no-op."""

    def __init__(self, scheduler, optimizers):
        super().__init__(scheduler, optimizers)

    def step(self, *args, **kwargs):
        """Do nothing: `model(**batch)` is doing that automatically, so no implementation is needed."""
        return None


def prepare_scheduler(accelerator, optimizer, scheduler):
    """
    Build the Megatron parameter scheduler for `optimizer`.

    The incoming `scheduler` argument is not used; the returned scheduler comes from
    `get_optimizer_param_scheduler`.
    """
    accelerator.print("Preparing scheduler")
    return get_optimizer_param_scheduler(optimizer)


class AbstractTrainStep(ABC):
    """
    Base class for batching, forward pass and loss handler.

    The three factory hooks below return `None` here; subclasses override them to return the actual callables.

    Args:
        name: Label stored on the instance as `name`.
    """

    def __init__(self, name):
        super().__init__()
        self.name = name

    def get_batch_func(self, accelerator, megatron_dataset_flag):
        """Hook for the batch-building function; returns `None` in the base class."""
        return None

    def get_forward_step_func(self):
        """Hook for the forward-step function; returns `None` in the base class."""
        return None

    def get_loss_func(self, accelerator):
        """Hook for the loss function; returns `None` in the base class."""
        return None


class BertTrainStep(AbstractTrainStep):
    """
    Bert train step class.

    Builds the batch function, the loss function and the forward-step function used for BERT pretraining or
    fine-tuning, and records which `transformers` output class (if any) should wrap the model outputs.

    Args:
        accelerator: Accelerator object; its `state.megatron_lm_plugin` is consulted for custom batch and loss
            functions.
        args (`argparse.Namespace`): Megatron-LM arguments.
    """

    def __init__(self, accelerator, args):
        super().__init__("BertTrainStep")
        self.get_batch = self.get_batch_func(accelerator, args.megatron_dataset_flag)
        self.loss_func = self.get_loss_func(accelerator, args.pretraining_flag, args.num_labels)
        self.forward_step = self.get_forward_step_func(args.pretraining_flag, args.bert_binary_head)
        if not args.model_return_dict:
            self.model_output_class = None
        else:
            from transformers.modeling_outputs import SequenceClassifierOutput

            self.model_output_class = SequenceClassifierOutput

    def get_batch_func(self, accelerator, megatron_dataset_flag):
        """
        Select the function that turns a data iterator into a BERT batch.

        Priority: the plugin's `custom_get_batch_function`, then (for Megatron datasets) `pretrain_bert.get_batch`
        if importable, then the local Megatron implementation; for non-Megatron datasets the transformers-style
        implementation is used.

        Returns:
            A callable taking `data_iterator`. The local implementations return
            `(tokens, types, sentence_order, loss_mask, lm_labels, padding_mask)`.
        """

        def get_batch_megatron(data_iterator):
            """Build the batch."""

            # Items and their type.
            keys = ["text", "types", "labels", "is_random", "loss_mask", "padding_mask"]
            datatype = torch.int64

            # Broadcast data.
            if data_iterator is not None:
                data = next(data_iterator)
            else:
                data = None
            data_b = tensor_parallel.broadcast_data(keys, data, datatype)

            # Unpack.
            tokens = data_b["text"].long()
            types = data_b["types"].long()
            sentence_order = data_b["is_random"].long()
            loss_mask = data_b["loss_mask"].float()
            lm_labels = data_b["labels"].long()
            padding_mask = data_b["padding_mask"].long()

            return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask

        def get_batch_transformer(data_iterator):
            """Build the batch."""
            data = next(data_iterator)
            data = send_to_device(data, torch.cuda.current_device())

            # Unpack. Optional entries that are absent from the batch are returned as `None`.
            tokens = data["input_ids"].long()
            padding_mask = data["attention_mask"].long()
            if "token_type_ids" in data:
                types = data["token_type_ids"].long()
            else:
                types = None
            if "labels" in data:
                lm_labels = data["labels"].long()
                # Positions labelled -100 do not contribute to the loss.
                loss_mask = (data["labels"] != -100).to(torch.float)
            else:
                lm_labels = None
                loss_mask = None
            if "next_sentence_label" in data:
                sentence_order = data["next_sentence_label"].long()
            else:
                sentence_order = None

            return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask

        if accelerator.state.megatron_lm_plugin.custom_get_batch_function is not None:
            return accelerator.state.megatron_lm_plugin.custom_get_batch_function
        if megatron_dataset_flag:
            try:
                # Use '--no-use-pep517 -e' to pip install nvidia's megatron from source
                from pretrain_bert import get_batch

                return get_batch
            except ImportError:
                pass
            return get_batch_megatron
        else:
            return get_batch_transformer

    def get_loss_func(self, accelerator, pretraining_flag, num_labels):
        """
        Select the loss function.

        Args:
            accelerator: Accelerator object; a plugin-level `custom_loss_function` takes precedence when set.
            pretraining_flag: When truthy the pretraining loss is returned, otherwise the fine-tuning loss.
            num_labels: Number of labels used by the fine-tuning loss to pick the loss type.

        Returns:
            A callable returning `(loss, dict_of_averaged_losses)`.
        """

        def loss_func_pretrain(loss_mask, sentence_order, output_tensor):
            lm_loss_, sop_logits = output_tensor

            lm_loss_ = lm_loss_.float()
            loss_mask = loss_mask.float()
            # Masked mean of the per-token LM loss.
            lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()

            if sop_logits is not None:
                sop_loss = F.cross_entropy(sop_logits.view(-1, 2).float(), sentence_order.view(-1), ignore_index=-1)
                sop_loss = sop_loss.float()
                loss = lm_loss + sop_loss
                averaged_losses = average_losses_across_data_parallel_group([lm_loss, sop_loss])
                return loss, {"lm loss": averaged_losses[0], "sop loss": averaged_losses[1]}

            else:
                loss = lm_loss
                averaged_losses = average_losses_across_data_parallel_group([lm_loss])
                return loss, {"lm loss": averaged_losses[0]}

        def loss_func_finetune(labels, logits):
            if num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            # Use the closed-over `num_labels`: the instance never defines a `num_labels` attribute, so reading
            # it from `self` would raise AttributeError.
            elif num_labels > 1 and (labels.dtype in (torch.long, torch.int)):
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
            else:
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
            averaged_losses = average_losses_across_data_parallel_group([loss])
            return loss, {"loss": averaged_losses[0]}

        if accelerator.state.megatron_lm_plugin.custom_loss_function is not None:
            return accelerator.state.megatron_lm_plugin.custom_loss_function
        if pretraining_flag:
            return loss_func_pretrain
        else:
            return loss_func_finetune

    def get_forward_step_func(self, pretraining_flag, bert_binary_head):
        """
        Build the forward-step function.

        Args:
            pretraining_flag: When truthy the model is called with `lm_labels` and the loss callable is bound to the
                loss mask and sentence order; otherwise it is bound to the labels only.
            bert_binary_head: When falsy, token type ids are not passed to the model.

        Returns:
            A callable `forward_step(data_iterator, model)` returning `(model_output, loss_callable)`.
        """

        def forward_step(data_iterator, model):
            """Forward step."""
            tokens, types, sentence_order, loss_mask, labels, padding_mask = self.get_batch(data_iterator)
            if not bert_binary_head:
                types = None
            # Forward pass through the model.
            if pretraining_flag:
                output_tensor = model(tokens, padding_mask, tokentype_ids=types, lm_labels=labels)
                return output_tensor, partial(self.loss_func, loss_mask, sentence_order)
            else:
                logits = model(tokens, padding_mask, tokentype_ids=types)
                return logits, partial(self.loss_func, labels)

        return forward_step


class GPTTrainStep(AbstractTrainStep):
    """
    GPT train step class.

    Builds the batch function, the loss function and the forward-step function used for GPT training, and records
    which `transformers` output class (if any) should wrap the model outputs.

    Args:
        accelerator: Accelerator object; its `state.megatron_lm_plugin` is consulted for custom batch and loss
            functions.
        args (`argparse.Namespace`): Megatron-LM arguments.
    """

    def __init__(self, accelerator, args):
        super().__init__("GPTTrainStep")
        self.get_batch = self.get_batch_func(accelerator, args.megatron_dataset_flag)
        self.loss_func = self.get_loss_func(accelerator)
        self.forward_step = self.get_forward_step_func()
        # Default to the configured EOS id; when a vocab file is given the Megatron tokenizer's EOD id takes
        # precedence. (Assigning the default after the tokenizer lookup would silently discard the tokenizer value.)
        self.eod_token = args.eos_token_id
        if args.vocab_file is not None:
            tokenizer = get_tokenizer()
            self.eod_token = tokenizer.eod
        self.pad_token = args.eos_token_id
        self.reset_position_ids = args.reset_position_ids
        self.reset_attention_mask = args.reset_attention_mask
        self.eod_mask_loss = args.eod_mask_loss
        if not args.model_return_dict:
            self.model_output_class = None
        else:
            from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions

            self.model_output_class = CausalLMOutputWithCrossAttentions

    def get_batch_func(self, accelerator, megatron_dataset_flag):
        """
        Select the function that turns a data iterator into a GPT batch.

        Priority: the plugin's `custom_get_batch_function`, then (for Megatron datasets) `pretrain_gpt.get_batch`
        if importable, then the local Megatron implementation; for non-Megatron datasets the transformers-style
        implementation is used.

        Returns:
            A callable taking `data_iterator`. The local implementations return
            `(tokens, labels, loss_mask, attention_mask, position_ids)`.
        """

        def get_batch_megatron(data_iterator):
            """Generate a batch"""
            # Items and their type.
            keys = ["text"]
            datatype = torch.int64

            # Broadcast data.
            if data_iterator is not None:
                data = next(data_iterator)
            else:
                data = None
            data_b = tensor_parallel.broadcast_data(keys, data, datatype)

            # Unpack. Labels are the tokens shifted left by one position.
            tokens_ = data_b["text"].long()
            labels = tokens_[:, 1:].contiguous()
            tokens = tokens_[:, :-1].contiguous()

            # Get the masks and position ids.
            attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
                tokens,
                eod_token=self.eod_token,
                pad_token=self.eod_token,
                reset_position_ids=self.reset_position_ids,
                reset_attention_mask=self.reset_attention_mask,
                eod_mask_loss=self.eod_mask_loss,
                pad_mask_loss=True,
            )
            return tokens, labels, loss_mask, attention_mask, position_ids

        def get_batch_transformer(data_iterator):
            """Generate a batch from a transformers-style dict; only `input_ids` is used."""
            data = next(data_iterator)
            data = {"input_ids": data["input_ids"]}
            data = send_to_device(data, torch.cuda.current_device())

            tokens_ = data["input_ids"].long()
            # Append one EOD column so that the shifted labels keep the input's sequence length.
            padding = torch.zeros((tokens_.shape[0], 1), dtype=tokens_.dtype, device=tokens_.device) + self.eod_token
            tokens_ = torch.concat([tokens_, padding], dim=1)
            labels = tokens_[:, 1:].contiguous()
            tokens = tokens_[:, :-1].contiguous()
            # Get the masks and position ids.
            attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
                tokens,
                eod_token=self.eod_token,
                pad_token=self.eod_token,
                reset_position_ids=self.reset_position_ids,
                reset_attention_mask=self.reset_attention_mask,
                eod_mask_loss=self.eod_mask_loss,
                pad_mask_loss=True,
            )
            return tokens, labels, loss_mask, attention_mask, position_ids

        if accelerator.state.megatron_lm_plugin.custom_get_batch_function is not None:
            return accelerator.state.megatron_lm_plugin.custom_get_batch_function
        if megatron_dataset_flag:
            try:
                # Use '--no-use-pep517 -e' to pip install nvidia's megatron from source
                from pretrain_gpt import get_batch

                return get_batch
            except ImportError:
                pass
            return get_batch_megatron
        else:
            return get_batch_transformer

    def get_loss_func(self, accelerator):
        """
        Select the loss function.

        A plugin-level `custom_loss_function` takes precedence when set; otherwise the local masked LM loss is
        returned.

        Returns:
            A callable `loss_func(loss_mask, output_tensor)` returning `(loss, output_dict)`.
        """
        args = get_args()

        def loss_func(loss_mask, output_tensor):
            if args.return_logits:
                losses, logits = output_tensor
            else:
                losses = output_tensor
            losses = losses.float()
            loss_mask = loss_mask.view(-1).float()
            if args.context_parallel_size > 1:
                # Sum numerator and denominator across the context-parallel group before dividing.
                loss = torch.cat([torch.sum(losses.view(-1) * loss_mask).view(1), loss_mask.sum().view(1)])
                torch.distributed.all_reduce(loss, group=mpu.get_context_parallel_group())
                loss = loss[0] / loss[1]
            else:
                loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()

            # Check individual rank losses are not NaN prior to DP all-reduce.
            if args.check_for_nan_in_loss_and_grad:
                global_rank = torch.distributed.get_rank()
                assert not loss.isnan(), (
                    f"Rank {global_rank}: found NaN in local forward loss calculation. "
                    f"Device: {torch.cuda.current_device()}, node: {os.uname()[1]}"
                )

            # Reduce loss for logging.
            averaged_loss = average_losses_across_data_parallel_group([loss])

            output_dict = {"lm loss": averaged_loss[0]}
            if args.return_logits:
                output_dict.update({"logits": logits})
            return loss, output_dict

        if accelerator.state.megatron_lm_plugin.custom_loss_function is not None:
            return accelerator.state.megatron_lm_plugin.custom_loss_function
        return loss_func

    def get_forward_step_func(self):
        """
        Build the forward-step function.

        Returns:
            A callable `forward_step(data_iterator, model)` returning `(output_tensor, loss_callable)`, where the loss
            callable is `self.loss_func` bound to the batch's loss mask.
        """

        def forward_step(data_iterator, model):
            """Forward step."""
            # Get the batch.
            tokens, labels, loss_mask, attention_mask, position_ids = self.get_batch(data_iterator)
            output_tensor = model(tokens, position_ids, attention_mask, labels=labels)

            return output_tensor, partial(self.loss_func, loss_mask)

        return forward_step


class T5TrainStep(AbstractTrainStep):
    """
    Train step helper for T5 models.

    Provides the batch function, loss function and forward-step function for encoder-decoder training, plus static
    helpers that build the boolean attention masks.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    """

    def __init__(self, accelerator, args):
        super().__init__("T5TrainStep")
        self.get_batch = self.get_batch_func(accelerator, args.megatron_dataset_flag)
        self.loss_func = self.get_loss_func(accelerator)
        self.forward_step = self.get_forward_step_func()
        if args.model_return_dict:
            from transformers.modeling_outputs import Seq2SeqLMOutput

            self.model_output_class = Seq2SeqLMOutput
        else:
            self.model_output_class = None

    @staticmethod
    def attn_mask_postprocess(attention_mask):
        """Turn a 2D `[b, s]` mask into a boolean `[b, s, s]` mask that is True where the pairwise product is < 0.5."""
        # [b, 1, s] * [b, s, 1] -> [b, s, s]
        pairwise_mask = attention_mask.unsqueeze(1) * attention_mask.unsqueeze(2)
        return pairwise_mask < 0.5

    @staticmethod
    def get_decoder_mask(seq_length, device):
        """Return a boolean `[1, seq_length, seq_length]` mask that is True strictly above the diagonal."""
        lower_triangular = torch.tril(torch.ones((1, seq_length, seq_length), device=device))
        return lower_triangular < 0.5

    @staticmethod
    def get_enc_dec_mask(attention_mask, dec_seq_length, device):
        """Broadcast a 2D `[b, s]` encoder mask to a boolean `[b, dec_seq_length, s]` mask (True where < 0.5)."""
        batch_size, _ = attention_mask.shape
        # [b, dec_s, 1] * [b, 1, s] -> [b, dec_s, s]
        decoder_ones = torch.ones((batch_size, dec_seq_length, 1), device=device)
        broadcast_mask = decoder_ones * attention_mask.unsqueeze(1)
        return broadcast_mask < 0.5

    def get_batch_func(self, accelerator, megatron_dataset_flag):
        """
        Pick the batch-building function.

        A plugin-level `custom_get_batch_function` wins when set. For Megatron datasets `pretrain_t5.get_batch` is
        used when importable, with the local Megatron implementation as fallback; otherwise the transformers-style
        implementation is returned.
        """

        def get_batch_megatron(data_iterator):
            """Build the batch."""
            keys = ["text_enc", "text_dec", "labels", "loss_mask", "enc_mask", "dec_mask", "enc_dec_mask"]
            datatype = torch.int64

            # Broadcast data.
            data = next(data_iterator) if data_iterator is not None else None
            data_b = tensor_parallel.broadcast_data(keys, data, datatype)

            # Unpack.
            tokens_enc = data_b["text_enc"].long()
            tokens_dec = data_b["text_dec"].long()
            labels = data_b["labels"].long()
            loss_mask = data_b["loss_mask"].float()

            enc_mask = data_b["enc_mask"] < 0.5
            dec_mask = data_b["dec_mask"] < 0.5
            enc_dec_mask = data_b["enc_dec_mask"] < 0.5

            return tokens_enc, tokens_dec, loss_mask, labels, enc_mask, dec_mask, enc_dec_mask

        def get_batch_transformer(data_iterator):
            """Build the batch."""
            batch = send_to_device(next(data_iterator), torch.cuda.current_device())

            tokens_enc = batch["input_ids"].long()
            labels = batch["labels"].long()
            loss_mask = (labels != -100).to(torch.float)
            if "decoder_input_ids" in batch:
                tokens_dec = batch["decoder_input_ids"].long()
            else:
                # Derive decoder inputs by shifting the labels right by one and zeroing the -100 entries.
                tokens_dec = labels.new_zeros(labels.shape, device=labels.device, dtype=torch.long)
                tokens_dec[..., 1:] = labels[..., :-1].clone()
                tokens_dec[..., 0] = 0
                tokens_dec.masked_fill_(tokens_dec == -100, 0)

            encoder_attention = batch["attention_mask"].long()
            dec_length = tokens_dec.shape[1]
            enc_mask = T5TrainStep.attn_mask_postprocess(encoder_attention)
            dec_mask = T5TrainStep.get_decoder_mask(dec_length, tokens_dec.device)
            enc_dec_mask = T5TrainStep.get_enc_dec_mask(encoder_attention, dec_length, tokens_dec.device)

            return tokens_enc, tokens_dec, loss_mask, labels, enc_mask, dec_mask, enc_dec_mask

        if accelerator.state.megatron_lm_plugin.custom_get_batch_function is not None:
            return accelerator.state.megatron_lm_plugin.custom_get_batch_function
        if not megatron_dataset_flag:
            return get_batch_transformer
        try:
            # Use '--no-use-pep517 -e' to pip install nvidia's megatron from source
            from pretrain_t5 import get_batch
        except ImportError:
            return get_batch_megatron
        return get_batch

    def get_loss_func(self, accelerator):
        """Return the plugin's `custom_loss_function` when set, otherwise the local masked LM loss."""

        def loss_func(loss_mask, output_tensor):
            per_token_loss = output_tensor.float()
            # Masked mean of the per-token loss.
            lm_loss = torch.sum(per_token_loss.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
            averaged_losses = average_losses_across_data_parallel_group([lm_loss])
            return lm_loss, {"lm loss": averaged_losses[0]}

        if accelerator.state.megatron_lm_plugin.custom_loss_function is not None:
            return accelerator.state.megatron_lm_plugin.custom_loss_function
        return loss_func

    def get_forward_step_func(self):
        """Return `forward_step(data_iterator, model)`, which yields `(output_tensor, loss_callable)`."""

        def forward_step(data_iterator, model):
            """Forward step."""
            # Get the batch.
            batch = self.get_batch(data_iterator)
            tokens_enc, tokens_dec, loss_mask, lm_labels, enc_mask, dec_mask, enc_dec_mask = batch
            # Forward model lm_labels
            output_tensor = model(
                tokens_enc, tokens_dec, enc_mask, dec_mask, enc_dec_mask, tokentype_ids=None, lm_labels=lm_labels
            )
            return output_tensor, partial(self.loss_func, loss_mask)

        return forward_step


def finish_mpu_init():
    """Initialize torch.distributed through Megatron and seed the random number generators."""
    megatron_args = get_args()

    # Pytorch distributed.
    _initialize_distributed(None, None, None)

    # Random seeds for reproducibility.
    seed = megatron_args.seed
    if megatron_args.rank == 0:
        print(f"> setting random seeds to {seed} ...")
    _set_random_seed(seed, megatron_args.data_parallel_random_init)


# initialize megatron setup
def initialize(accelerator, extra_args_provider=None, args_defaults=None):
    """
    Parse Megatron-LM arguments, apply overrides and run Megatron's global initialization.

    The steps run in a fixed order: parse arguments (unknown arguments are ignored), overwrite them with
    `args_defaults`, optionally load arguments from a checkpoint, validate, set Megatron's global variables (without
    building a tokenizer), initialize distributed state and seeds, then autoresume, dependency compilation and JIT
    fusion options. Finally a few derived attributes (`padded_vocab_size`, `bert_binary_head`, `iteration`) are set on
    the global arguments.

    Args:
        accelerator: Object providing `print`.
        extra_args_provider: Forwarded to `parse_args`.
        args_defaults (`dict`, *optional*): Attribute names and values written onto the parsed arguments. Every entry
            is applied; a warning is printed on rank 0 when the existing value is not `None`.

    Raises:
        AssertionError: If CUDA is unavailable, or if checkpoint arguments are requested without `args.load`.
    """
    if args_defaults is None:
        args_defaults = {}
    accelerator.print("Initializing Megatron-LM")
    assert torch.cuda.is_available(), "Megatron requires CUDA."

    # Parse arguments
    args = parse_args(extra_args_provider, ignore_unknown_args=True)

    # Set defaults
    for key, value in args_defaults.items():
        if getattr(args, key, None) is not None:
            if args.rank == 0:
                print(
                    f"WARNING: overriding default arguments for {key}:{getattr(args, key)} with {key}:{value}",
                    flush=True,
                )
        # The value is written unconditionally; the check above only controls the warning.
        setattr(args, key, value)

    if args.use_checkpoint_args or args_defaults.get("use_checkpoint_args", False):
        assert args.load is not None, "--use-checkpoints-args requires --load argument"
        load_args_from_checkpoint(args)

    validate_args(args)

    # set global args, build tokenizer, and set adlr-autoresume,
    # tensorboard-writer, and timers.
    set_global_variables(args, build_tokenizer=False)

    # Megatron's MPU is the master. Complete initialization right away.
    finish_mpu_init()

    # Autoresume.
    _init_autoresume()

    # Compile dependencies.
    _compile_dependencies()

    # Set pytorch JIT layer fusion options and warmup JIT functions.
    set_jit_fusion_options()
    # Re-read the global arguments after initialization before setting derived attributes.
    args = get_args()
    if getattr(args, "padded_vocab_size", None) is None:
        args.padded_vocab_size = _vocab_size_with_padding(args.orig_vocab_size, args)
    # The binary head is only enabled for BERT pretraining with exactly two labels.
    if args.model_type_name == "bert" and args.pretraining_flag and args.num_labels == 2:
        args.bert_binary_head = True
    else:
        args.bert_binary_head = False
    args.iteration = 0


class MegatronEngine(torch.nn.Module):
    """
    Megatron-LM model wrapper

    Args:
        accelerator (:class:`~accelerate.Accelerator`): The accelerator object to use.
        model: Megatron-LM model
        optimizer: Megatron-LM optimizer
        lr_scheduler: Megatron-LM lr scheduler
    """

    def __init__(self, accelerator, model, optimizer, scheduler):
        """
        Store the model chunks, optimizer and scheduler, pick the train-step handler and reset tracking state.

        A plugin-level `custom_train_step_class` takes precedence; otherwise the handler is chosen from
        `args.model_type_name` ("bert", "gpt" or "t5").

        Raises:
            ValueError: If no custom class is configured and the model type is not one of the supported names.
        """
        super().__init__()
        self.module = model
        self.base_model = model[0]
        self.optimizer = optimizer
        self.scheduler = scheduler

        args = get_args()
        plugin = accelerator.state.megatron_lm_plugin
        if plugin.custom_train_step_class is not None:
            self.train_step_handler = plugin.custom_train_step_class(args, **plugin.custom_train_step_kwargs)
        else:
            builtin_train_steps = {"bert": BertTrainStep, "gpt": GPTTrainStep, "t5": T5TrainStep}
            train_step_cls = builtin_train_steps.get(args.model_type_name)
            if train_step_cls is None:
                raise ValueError(f"Unsupported model type: {args.model_type_name}")
            self.train_step_handler = train_step_cls(accelerator, args)
        self.optimizer.skipped_iter = False

        # Tracking loss.
        self.total_loss_dict = {}
        self.eval_total_loss_dict = {}
        self.iteration = 0
        self.report_memory_flag = True
        self.num_floating_point_operations_so_far = 0
        # Built lazily on the first `train()` / `eval()` call.
        self.module_config = None
        if args.tensorboard_dir is not None:
            write_args_to_tensorboard()

    def get_module_config(self):
        """
        Fetch the model config of the first model chunk and attach the training callbacks to it.

        Sets `grad_scale_func` and `finalize_model_grads_func` unconditionally. `no_sync_func` / `grad_sync_func` are
        set when the first chunk is a `LocalDDP` and `args.overlap_grad_reduce` is enabled, and `param_sync_func`
        when both `args.overlap_param_gather` and `args.delay_param_gather` are enabled. With a single model chunk the
        per-chunk lists are collapsed to their only element.

        Returns:
            The config object returned by `get_model_config`, updated in place.

        Raises:
            AssertionError: If grad-reduce overlap is enabled while `config.no_sync_func` is already set.
        """
        args = get_args()
        config = get_model_config(self.module[0])
        # Setup some training config params
        config.grad_scale_func = self.optimizer.scale_loss
        if isinstance(self.module[0], LocalDDP) and args.overlap_grad_reduce:
            assert config.no_sync_func is None, (
                "When overlap_grad_reduce is True, config.no_sync_func must be None; "
                "a custom no_sync_func is not supported when overlapping grad-reduce"
            )
            config.no_sync_func = [model_chunk.no_sync for model_chunk in self.module]
            if len(self.module) == 1:
                config.no_sync_func = config.no_sync_func[0]
            if args.delay_grad_reduce:
                config.grad_sync_func = [model_chunk.start_grad_sync for model_chunk in self.module]
                if len(self.module) == 1:
                    config.grad_sync_func = config.grad_sync_func[0]
        if args.overlap_param_gather and args.delay_param_gather:
            # Bind `model_index` as a default argument: a plain closure would late-bind the comprehension variable
            # and every callback would end up targeting the last model chunk.
            config.param_sync_func = [
                lambda x, model_index=model_index: self.optimizer.finish_param_sync(model_index, x)
                for model_index in range(len(self.module))
            ]
            if len(self.module) == 1:
                config.param_sync_func = config.param_sync_func[0]
        config.finalize_model_grads_func = finalize_model_grads
        return config

    def train(self):
        """Switch every model chunk to training mode, build the module config if needed and log eval results."""
        for model_chunk in self.module:
            model_chunk.train()

        # The config is built once and then reused.
        if self.module_config is None:
            self.module_config = self.get_module_config()

        self.log_eval_results()

    def eval(self):
        for model_module in self.module:
            model_module.eval()

        if self.module_config is None:
            self.module_config = self.get_module_config()

    def get_batch_data_iterator(self, batch_data):
        """
        Wrap a batch dict in the iterator structure expected by the forward/backward function.

        The batch is sliced along the first axis into `args.num_micro_batches` chunks of `args.micro_batch_size`
        items (or kept whole when there is a single micro batch). With several model chunks, one independent
        iterator per chunk is returned in a list. An empty batch yields `None` (or a list of `None`).
        """
        args = get_args()
        num_model_chunks = len(self.module)

        if len(batch_data) == 0:
            return [None] * num_model_chunks if num_model_chunks > 1 else None

        if args.num_micro_batches > 1:
            data_chunks = [
                {
                    key: value[idx * args.micro_batch_size : (idx + 1) * args.micro_batch_size]
                    for key, value in batch_data.items()
                }
                for idx in range(0, args.num_micro_batches)
            ]
        else:
            data_chunks = [batch_data]

        if num_model_chunks > 1:
            return [iter(data_chunks) for _ in range(num_model_chunks)]
        return iter(data_chunks)

    def train_step(self, **batch_data):
        """
        Training step for Megatron-LM

        Delegates to the module-level `train_step` function and records on the optimizer whether the iteration was
        skipped.

        Args:
            batch_data (:obj:`dict`): The batch data to train on.

        Returns:
            A tuple `(loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad)`.
        """
        data_iterator = self.get_batch_data_iterator(batch_data)

        step_outputs = train_step(
            forward_step_func=self.train_step_handler.forward_step,
            data_iterator=data_iterator,
            model=self.module,
            optimizer=self.optimizer,
            opt_param_scheduler=self.scheduler,
            config=self.module_config,
            forward_backward_func=get_forward_backward_func(),
        )
        # Seven values come back; the third to fifth are not used here.
        loss_reduced, skipped_iter, _, _, _, grad_norm, num_zeros_in_grad = step_outputs

        self.optimizer.skipped_iter = skipped_iter == 1

        return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad

    def eval_step(self, **batch_data):
        """
        Evaluation step for Megatron-LM

        Runs a forward-only pass, updates `args.consumed_valid_samples` and, on the last pipeline stage, reduces
        the per-microbatch loss dicts: scalar entries are averaged, other entries are concatenated.

        Args:
            batch_data (:obj:`dict`): The batch data to evaluate on.

        Returns:
            A dict of reduced values on the last pipeline stage, otherwise an empty dict.
        """
        args = get_args()
        data_iterator = self.get_batch_data_iterator(batch_data)
        run_forward_backward = get_forward_backward_func()
        loss_dicts = run_forward_backward(
            forward_step_func=self.train_step_handler.forward_step,
            data_iterator=data_iterator,
            model=self.module,
            num_microbatches=get_num_microbatches(),
            seq_length=args.seq_length,
            micro_batch_size=args.micro_batch_size,
            forward_only=True,
        )
        # Empty unused memory
        if args.empty_unused_memory_level >= 1:
            torch.cuda.empty_cache()

        args.consumed_valid_samples += (
            mpu.get_data_parallel_world_size() * args.micro_batch_size * get_num_microbatches()
        )

        if not mpu.is_pipeline_last_stage(ignore_virtual=True):
            return {}

        # Average loss across microbatches.
        loss_reduced = {}
        for key in loss_dicts[0]:
            per_microbatch = [loss_dict[key] for loss_dict in loss_dicts]
            if len(per_microbatch[0].shape) == 0:
                loss_reduced[key] = sum(per_microbatch) / len(per_microbatch)
            else:
                loss_reduced[key] = torch.concat(per_microbatch)
        return loss_reduced

    def forward(self, **batch_data):
        """
        Run one training or evaluation step, chosen by `self.module[0].training`, and return the resulting loss.

        In training mode this delegates to `self.train_step`, then advances `self.iteration`,
        `args.consumed_train_samples` and `self.num_floating_point_operations_so_far`. In evaluation mode it
        delegates to `self.eval_step`. In both modes extra logging/accumulation only happens when
        `args.tensorboard_dir` is set.

        Args:
            batch_data (:obj:`dict`): The batch data forwarded unchanged to `train_step` / `eval_step`.

        Returns:
            The sum of all entries of the step's loss dict that have an empty shape. When
            `self.train_step_handler.model_output_class` is set, that class is instantiated with `loss=` and
            `logits=` (the `"logits"` entry of the loss dict, or `None`) and returned instead.
        """
        # During training, we use train_step()
        # model(**batch_data) performs following operations by delegating it to `self.train_step`:
        # 1. Prepare **batch_data for Tensor, Pipeline and Model Parallelism
        # 2. Set grad to zero.
        # 3. forward pass and backward pass using Pipeline Parallelism
        # 4. Empty unused memory.
        # 5. Reduce gradients.
        # 6. Update parameters.
        # 7. Gather params when using Distributed Optimizer (Data Parallelism).
        # 8. Update learning rate if scheduler is specified.
        # 9. Empty unused memory.
        # 10. Average loss across microbatches and across DP ranks.
        #
        # During evaluation, we use eval_step()
        args = get_args()
        if self.module[0].training:
            loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = self.train_step(**batch_data)
            self.iteration += 1
            # Samples consumed by this step: data parallel world size * micro batch size * number of microbatches.
            batch_size = mpu.get_data_parallel_world_size() * args.micro_batch_size * get_num_microbatches()
            args.consumed_train_samples += batch_size
            self.num_floating_point_operations_so_far += num_floating_point_operations(args, batch_size)
            if args.tensorboard_dir is not None:
                # Logging.
                loss_scale = self.optimizer.get_loss_scale().item()
                params_norm = None
                if args.log_params_norm:
                    # NOTE(review): every other visible method of this class uses `self.module`; confirm that
                    # `self.model` is actually defined, otherwise this line fails when `log_params_norm` is set.
                    params_norm = calc_params_l2_norm(self.model)
                self.report_memory_flag = training_log(
                    loss_dict,
                    self.total_loss_dict,
                    self.optimizer.param_groups[0]["lr"],
                    self.iteration,
                    loss_scale,
                    self.report_memory_flag,
                    skipped_iter,
                    grad_norm,
                    params_norm,
                    num_zeros_in_grad,
                )
        else:
            loss_dict = self.eval_step(**batch_data)
            if args.tensorboard_dir is not None:
                # Keep a running sum per key plus a matching "<key>_num_iters" counter; `log_eval_results`
                # divides one by the other.
                for key in loss_dict:
                    self.eval_total_loss_dict[key] = (
                        self.eval_total_loss_dict.get(key, torch.cuda.FloatTensor([0.0])) + loss_dict[key]
                    )
                    self.eval_total_loss_dict[key + "_num_iters"] = self.eval_total_loss_dict.get(
                        key + "_num_iters", torch.cuda.FloatTensor([0.0])
                    ) + torch.cuda.FloatTensor([1.0])

        # Only entries with an empty shape contribute to the returned loss.
        loss = torch.tensor(0.0, device=torch.cuda.current_device())
        for key in loss_dict:
            if len(loss_dict[key].shape) == 0:
                loss += loss_dict[key]

        logits = None
        if "logits" in loss_dict:
            logits = loss_dict["logits"]
        if self.train_step_handler.model_output_class is not None:
            return self.train_step_handler.model_output_class(loss=loss, logits=logits)
        return loss

    def log_eval_results(self):
        """
        Report the evaluation losses accumulated in `self.eval_total_loss_dict`, then reset the accumulator.

        Does nothing when `args.tensorboard_dir` is unset or when `self.iteration` is still 0. Otherwise, for
        every accumulated key (the `"<key>_num_iters"` counters are skipped) the mean value is computed, appended
        to a summary line printed through `print_rank_last`, and written to the tensorboard writer when one is
        available. A perplexity (`exp` of the mean, with the mean capped at 20) is added as well when
        `args.pretraining_flag` is set.
        """
        args = get_args()
        if args.tensorboard_dir is None or self.iteration == 0:
            return
        writer = get_tensorboard_writer()
        report = [f"validation loss at iteration {self.iteration} | "]
        for key in self.eval_total_loss_dict:
            if key.endswith("_num_iters"):
                continue
            value = self.eval_total_loss_dict[key] / self.eval_total_loss_dict[key + "_num_iters"]
            report.append(f"{key} value: {value} | ")
            # Convert to a Python number once and reuse it instead of calling `.item()` for every use.
            scalar = value.item()
            # The mean is capped at 20 before exponentiation to keep the perplexity bounded.
            ppl = math.exp(min(20, scalar))
            if args.pretraining_flag:
                report.append(f"{key} PPL: {ppl} | ")
            if writer:
                writer.add_scalar(f"{key} validation", scalar, self.iteration)
                if args.pretraining_flag:
                    writer.add_scalar(f"{key} validation ppl", ppl, self.iteration)

        string = "".join(report)
        length = len(string) + 1
        print_rank_last("-" * length)
        print_rank_last(string)
        print_rank_last("-" * length)
        # Start a fresh accumulation window for the next evaluation run.
        self.eval_total_loss_dict = {}

    def save_checkpoint(self, output_dir):
        """
        Flush pending evaluation logs and write a checkpoint to `output_dir`.

        Args:
            output_dir: Destination stored in `args.save` before the module-level `save_checkpoint` function
                is called.
        """
        self.log_eval_results()
        args = get_args()
        args.save = output_dir
        # Barriers on both sides of the save keep all ranks in step around the checkpoint write.
        torch.distributed.barrier()
        # This resolves to the module-level `save_checkpoint` function, not to this method.
        save_checkpoint(
            self.iteration,
            self.module,
            self.optimizer,
            self.scheduler,
            num_floating_point_operations_so_far=self.num_floating_point_operations_so_far,
        )
        torch.distributed.barrier()

    def load_checkpoint(self, input_dir):
        """
        Restore module, optimizer and scheduler state from the checkpoint in `input_dir`.

        Args:
            input_dir: Source stored in `args.load` before the module-level `load_checkpoint` function is
                called.
        """
        args = get_args()
        args.load = input_dir
        # Reset the consumed-sample counters before loading.
        args.consumed_train_samples = 0
        args.consumed_valid_samples = 0
        torch.distributed.barrier()
        # This resolves to the module-level `load_checkpoint` function, not to this method.
        iteration, num_floating_point_operations_so_far = load_checkpoint(self.module, self.optimizer, self.scheduler)
        torch.distributed.barrier()
        self.iteration = iteration
        self.num_floating_point_operations_so_far = num_floating_point_operations_so_far
        # With fp16 and a checkpoint that reports iteration 0, ask the optimizer to reload the model params.
        if args.fp16 and self.iteration == 0:
            self.optimizer.reload_model_params()


# other utilities
def avg_losses_across_data_parallel_group(losses):
    """
    Average losses across data parallel group.

    Thin alias that forwards `losses` unchanged to `average_losses_across_data_parallel_group`.

    Args:
        losses (List[Tensor]): List of losses to average across data parallel group.

    Returns:
        Whatever `average_losses_across_data_parallel_group` returns for `losses`.
    """

    return average_losses_across_data_parallel_group(losses)


def gather_across_data_parallel_groups(tensor):
    """
    Gather every tensor found in a (possibly nested) list/tuple/dictionary from all data parallel ranks.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather across data parallel ranks.

    Returns:
        The result of applying the gather to each tensor through `recursively_apply`; each gathered tensor is the
        concatenation, along dim 0, of the tensors contributed by the ranks of the data parallel group.
    """

    def _gather_leaf(leaf):
        # A 0-dim tensor gets a leading axis so the per-rank results can be concatenated along dim 0.
        if leaf.ndim == 0:
            leaf = leaf.clone()[None]
        dp_group = mpu.get_data_parallel_group()
        dp_world_size = torch.distributed.get_world_size(group=dp_group)
        # One receive buffer per rank of the data parallel group.
        received = [torch.empty_like(leaf) for _ in range(dp_world_size)]
        torch.distributed.all_gather(received, leaf, group=dp_group)
        return torch.cat(received, dim=0)

    return recursively_apply(_gather_leaf, tensor, error_on_other_type=True)


================================================
FILE: src/accelerate/utils/memory.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
A collection of utilities for ensuring that training can always occur. Heavily influenced by the
[toma](https://github.com/BlackHC/toma) library.
"""

import functools
import gc
import inspect
from typing import Optional

import torch

from .imports import (
    is_cuda_available,
    is_hpu_available,
    is_mlu_available,
    is_mps_available,
    is_musa_available,
    is_neuron_available,
    is_npu_available,
    is_sdaa_available,
    is_xpu_available,
)


def clear_device_cache(garbage_collection=False):
    """
    Empties the cache of the first available backend by calling `torch.{backend}.empty_cache`.

    Args:
        garbage_collection (`bool`, *optional*, defaults to `False`):
            Whether to run `gc.collect()` first. Note that this is a *considerable* slowdown and should be used
            sparingly.
    """
    if garbage_collection:
        gc.collect()

    # Probed in order; the first available backend wins and the remaining probes are not evaluated.
    # A `None` backend name means "available, but nothing to call".
    backend_probes = (
        (is_xpu_available, "xpu"),
        (is_mlu_available, "mlu"),
        (is_sdaa_available, "sdaa"),
        (is_musa_available, "musa"),
        (is_npu_available, "npu"),
        (lambda: is_mps_available(min_version="2.0"), "mps"),
        (is_cuda_available, "cuda"),
        # torch.hpu.empty_cache() is not available on hpu as it reserves all device memory for the current process
        (is_hpu_available, None),
        # Not sure it actually does something, but adding for consistency with other backends
        (is_neuron_available, "neuron"),
    )
    for backend_is_available, backend_name in backend_probes:
        if not backend_is_available():
            continue
        if backend_name is not None:
            getattr(torch, backend_name).empty_cache()
        return


def release_memory(*objects):
    """
    Replaces `objects` with `None` placeholders, then calls `clear_device_cache(garbage_collection=True)`, which
    runs `gc.collect()` and empties the cache of the available device backend. Returned objects should be
    reassigned to the same variables: the caller's own variables keep referencing the originals until that
    assignment happens.

    Args:
        objects (`Iterable`):
            An iterable of objects
    Returns:
        A list of `None` objects to replace `objects`

    Example:

        ```python
        >>> import torch
        >>> from accelerate.utils import release_memory

        >>> a = torch.ones(1000, 1000).cuda()
        >>> b = torch.ones(1000, 1000).cuda()
        >>> a, b = release_memory(a, b)
        ```
    """
    # `*objects` always arrives as a tuple; rebinding the name drops this frame's references to the originals
    # before the cache is cleared.
    objects = [None] * len(objects)
    clear_device_cache(garbage_collection=True)
    return objects


def should_reduce_batch_size(exception: Exception) -> bool:
    """
    Checks if `exception` relates to CUDA out-of-memory, XPU out-of-memory, CUDNN not supported, CPU out-of-memory
    or HPU out-of-memory

    Args:
        exception (`Exception`):
            An exception

    Returns:
        `bool`: `True` only when `exception` is a `RuntimeError` carrying exactly one string argument that contains
        one of the known out-of-memory / CUDNN messages. Any other exception returns `False`.
    """
    _statements = (
        " out of memory.",  # OOM for CUDA, HIP, XPU
        "cuDNN error: CUDNN_STATUS_NOT_SUPPORTED.",  # CUDNN SNAFU
        "DefaultCPUAllocator: can't allocate memory",  # CPU OOM
        "FATAL ERROR :: MODULE:PT_DEVMEM Allocation failed",  # HPU OOM
    )
    if not isinstance(exception, RuntimeError) or len(exception.args) != 1:
        return False
    message = exception.args[0]
    # A non-string payload (e.g. `RuntimeError(5)`) cannot match; without this guard the substring test would
    # raise a `TypeError` and hide the exception that is being inspected.
    if not isinstance(message, str):
        return False
    return any(err in message for err in _statements)


def find_executable_batch_size(
    function: Optional[callable] = None,
    starting_batch_size: int = 128,
    reduce_batch_size_fn: Optional[callable] = None,
):
    """
    A basic decorator that will try to execute `function`. If it fails from exceptions related to out-of-memory or
    CUDNN, the batch size is reduced (multiplied by 0.9 by default) and passed to `function` again.

    `function` must take in a `batch_size` parameter as its first argument.

    The current batch size is kept between calls of the decorated function, so a later call starts from the last
    value that was tried.

    Args:
        function (`callable`, *optional*):
            A function to wrap
        starting_batch_size (`int`, *optional*):
            The batch size to try and fit into memory
        reduce_batch_size_fn (`callable`, *optional*):
            A function called without arguments after a batch-size related failure; its return value is the next
            batch size to try. Defaults to multiplying the current batch size by 0.9 and truncating to `int`.

    Raises:
        TypeError: when the decorated function is called with more positional arguments than it can accept once
            `batch_size` has been inserted (typically because the caller passed the batch size explicitly).
        RuntimeError: when the batch size reaches zero without a successful run.

    Example:

    ```python
    >>> from accelerate.utils import find_executable_batch_size


    >>> @find_executable_batch_size(starting_batch_size=128)
    ... def train(batch_size, model, optimizer):
    ...     ...


    >>> train(model, optimizer)
    ```
    """
    if function is None:
        # Used as `@find_executable_batch_size(...)`: forward *every* option to the second call, otherwise a
        # custom `reduce_batch_size_fn` would be silently ignored.
        return functools.partial(
            find_executable_batch_size,
            starting_batch_size=starting_batch_size,
            reduce_batch_size_fn=reduce_batch_size_fn,
        )

    batch_size = starting_batch_size
    if reduce_batch_size_fn is None:

        def reduce_batch_size_fn():
            nonlocal batch_size
            batch_size = int(batch_size * 0.9)
            return batch_size

    def decorator(*args, **kwargs):
        nonlocal batch_size
        clear_device_cache(garbage_collection=True)
        params = list(inspect.signature(function).parameters.keys())
        # Guard against user error
        if len(params) < (len(args) + 1):
            arg_str = ", ".join([f"{arg}={value}" for arg, value in zip(params[1:], args[1:])])
            raise TypeError(
                f"Batch size was passed into `{function.__name__}` as the first argument when called. "
                f"Remove this as the decorator already does so: `{function.__name__}({arg_str})`"
            )
        while True:
            if batch_size == 0:
                raise RuntimeError("No executable batch size found, reached zero.")
            try:
                return function(batch_size, *args, **kwargs)
            except Exception as e:
                # Only retry for failures that a smaller batch can fix; anything else propagates unchanged.
                if should_reduce_batch_size(e):
                    clear_device_cache(garbage_collection=True)
                    batch_size = reduce_batch_size_fn()
                else:
                    raise

    return decorator


================================================
FILE: src/accelerate/utils/modeling.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import contextlib
import gc
import inspect
import json
import logging
import os
import re
import shutil
import tempfile
import warnings
from collections import OrderedDict, defaultdict
from typing import Optional, Union

import torch
from torch import distributed as dist
from torch import nn

from ..state import AcceleratorState
from .constants import SAFE_WEIGHTS_NAME, WEIGHTS_NAME
from .dataclasses import AutocastKwargs, CustomDtype, DistributedType
from .imports import (
    is_hpu_available,
    is_mlu_available,
    is_mps_available,
    is_musa_available,
    is_npu_available,
    is_peft_available,
    is_sdaa_available,
    is_torch_xla_available,
    is_xpu_available,
)
from .memory import clear_device_cache
from .offload import load_offloaded_weight, offload_weight, save_offload_index
from .tqdm import is_tqdm_available, tqdm
from .versions import is_torch_version


if is_npu_available(check_device=False):
    import torch_npu  # noqa: F401

if is_mlu_available(check_device=False):
    import torch_mlu  # noqa: F401

if is_sdaa_available(check_device=False):
    import torch_sdaa  # noqa: F401

if is_musa_available(check_device=False):
    import torch_musa  # noqa: F401

from safetensors import safe_open
from safetensors.torch import load_file as safe_load_file


WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json"

logger = logging.getLogger(__name__)


def is_peft_model(model):
    """
    Tells whether `model`, once passed through `extract_model_from_parallel`, is an instance of `peft.PeftModel`.
    Always `False` when `peft` is not available.
    """
    from .other import extract_model_from_parallel

    if not is_peft_available():
        return False

    # Only imported once we know the package is available.
    from peft import PeftModel

    return isinstance(extract_model_from_parallel(model), PeftModel)


def check_device_same(first_device, second_device):
    """
    Utility method to check whether two `torch` devices designate the same device. For torch accelerator devices
    (e.g. cuda, xpu), `torch.device("cuda") == torch.device("cuda:0")` evaluates to `False`, whereas both should be
    treated as the same device; a missing index is therefore treated as index `0` before comparing.

    Args:
        first_device (`torch.device`):
            First device to check
        second_device (`torch.device`):
            Second device to check

    Returns:
        `bool`: `True` if both devices have the same type and, after defaulting a missing index to `0` on non-cpu
        devices, compare equal.
    """
    if first_device.type != second_device.type:
        return False

    def _with_default_index(device):
        # cpu devices and devices that already carry an index are compared as they are.
        if device.type == "cpu" or device.index is not None:
            return device
        return torch.device(device.type, index=0)

    return _with_default_index(first_device) == _with_default_index(second_device)


def convert_file_size_to_int(size: Union[int, str]):
    """
    Converts a size expressed as a string with digits and a unit (like `"5MB"`) to an integer (in bytes).

    Binary units (`GiB`, `MiB`, `KiB`) and decimal units (`GB`, `MB`, `KB`) are matched case-insensitively. For the
    decimal units only, a lowercase trailing `b` (as in `"5Gb"`) is read as bits and the result is divided by 8.

    Args:
        size (`int` or `str`): The size to convert. Will be directly returned if an `int`.

    Raises:
        ValueError: if the numeric part cannot be parsed, if no known unit is found, or if the result is negative.

    Example:

    ```py
    >>> convert_file_size_to_int("1MiB")
    1048576
    ```
    """
    err_msg = (
        f"`size` {size} is not in a valid format. Use an integer for bytes, or a string with an unit (like '5.0GB')."
    )
    # (upper-cased suffix, bytes per unit, whether a lowercase trailing "b" means bits)
    # Three-letter binary suffixes come first, as in the order they are tested.
    known_units = (
        ("GIB", 2**30, False),
        ("MIB", 2**20, False),
        ("KIB", 2**10, False),
        ("GB", 10**9, True),
        ("MB", 10**6, True),
        ("KB", 10**3, True),
    )

    mem_size = -1
    try:
        if isinstance(size, int):
            mem_size = size
        else:
            for suffix, bytes_per_unit, may_be_bits in known_units:
                if not size.upper().endswith(suffix):
                    continue
                amount = int(float(size[: -len(suffix)]) * bytes_per_unit)
                if may_be_bits and size.endswith("b"):
                    amount = amount // 8
                mem_size = amount
                break
    except ValueError:
        raise ValueError(err_msg)

    # Still -1 when no unit matched; negative inputs are rejected the same way.
    if mem_size < 0:
        raise ValueError(err_msg)
    return mem_size


def dtype_byte_size(dtype: torch.dtype):
    """
    Returns the size (in bytes) occupied by one parameter of type `dtype`. The result is fractional for `torch.bool`
    (1/8), `CustomDtype.INT2` (1/4) and `CustomDtype.INT4` (1/2).

    Raises:
        ValueError: if `dtype` is none of the explicitly handled dtypes and its string form does not end with a
            bit width.

    Example:

    ```py
    >>> dtype_byte_size(torch.float32)
    4
    ```
    """
    if dtype == torch.bool:
        return 1 / 8

    # Custom dtypes are matched explicitly, in this order.
    custom_sizes = (
        (CustomDtype.INT2, 1 / 4),
        (CustomDtype.INT4, 1 / 2),
        (CustomDtype.FP8, 1),
    )
    for custom_dtype, num_bytes in custom_sizes:
        if dtype == custom_dtype:
            return num_bytes

    # The torch version is checked first so the float8 attributes are only touched on torch >= 2.1.0.
    if is_torch_version(">=", "2.1.0") and dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
        return 1

    # Every remaining dtype is expected to end with its bit width, e.g. "torch.float32".
    trailing_bits = re.search(r"[^\d](\d+)$", str(dtype))
    if trailing_bits is None:
        raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
    return int(trailing_bits.group(1)) // 8


def id_tensor_storage(tensor: torch.Tensor) -> tuple[torch.device, int, int]:
    """
    Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For
    example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is
    guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with
    non-overlapping lifetimes may have the same id.

    Returns:
        A `(device, storage pointer, storage size in bytes)` tuple.
    """
    # Element sizes, only needed by the fallbacks that have to derive a byte count from an element count.
    bytes_per_element = {
        torch.float64: 8,
        torch.int64: 8,
        torch.float32: 4,
        torch.int32: 4,
        torch.bfloat16: 2,
        torch.float16: 2,
        torch.int16: 2,
        torch.uint8: 1,
        torch.int8: 1,
        torch.bool: 1,
    }
    try:
        untyped = tensor.untyped_storage()
        pointer = untyped.data_ptr()
        num_bytes = untyped.nbytes()
    except Exception:
        try:
            # Fallback for torch==1.10
            typed = tensor.storage()
            pointer = typed.data_ptr()
            num_bytes = typed.size() * bytes_per_element[tensor.dtype]
        except NotImplementedError:
            # Fallback for meta storage
            pointer = 0
            # On torch >=2.0 this is the tensor size
            num_bytes = tensor.nelement() * bytes_per_element[tensor.dtype]

    return tensor.device, pointer, num_bytes


def set_module_tensor_to_device(
    module: nn.Module,
    tensor_name: str,
    device: Union[int, str, torch.device],
    value: Optional[torch.Tensor] = None,
    dtype: Optional[Union[str, torch.dtype]] = None,
    fp16_statistics: Optional[torch.HalfTensor] = None,
    tied_params_map: Optional[dict[int, dict[torch.device, torch.Tensor]]] = None,
    non_blocking: bool = False,
    clear_cache: bool = True,
):
    """
    A helper function to set a given tensor (parameter of buffer) of a module on a specific device (note that doing
    `param.to(device)` creates a new tensor not linked to the parameter, which is why we need this function).

    Args:
        module (`torch.nn.Module`):
            The module in which the tensor we want to move lives.
        tensor_name (`str`):
            The full name of the parameter/buffer.
        device (`int`, `str` or `torch.device`):
            The device on which to set the tensor.
        value (`torch.Tensor`, *optional*):
            The value of the tensor (useful when going from the meta device to any other device).
        dtype (`torch.dtype`, *optional*):
            If passed along the value of the parameter will be cast to this `dtype`. Otherwise, `value` will be cast to
            the dtype of the existing parameter in the model.
        fp16_statistics (`torch.HalfTensor`, *optional*):
            The list of fp16 statistics to set on the module, used for 8 bit model serialization.
        tied_params_map (Dict[int, Dict[torch.device, torch.Tensor]], *optional*, defaults to `None`):
            A map of current data pointers to dictionaries of devices to already dispatched tied weights. For a given
            execution device, this parameter is useful to reuse the first available pointer of a shared weight on the
            device for all others, instead of duplicating memory.
        non_blocking (`bool`, *optional*, defaults to `False`):
            If `True`, the device transfer will be asynchronous with respect to the host, if possible.
        clear_cache (`bool`, *optional*, defaults to `True`):
            Whether or not to clear the device cache after setting the tensor on the device.

    Raises:
        ValueError: if an intermediate attribute of a dotted `tensor_name` is `None`; if the final name is neither a
            parameter nor a buffer of the resolved module; if the existing tensor is on the meta device, the target
            is not, and no `value` is given; if `value` has a different shape than the existing tensor (except for
            `Params4bit` parameters); or if an xpu device is requested while xpu is not available.
    """
    # Recurse if needed
    if "." in tensor_name:
        splits = tensor_name.split(".")
        for split in splits[:-1]:
            new_module = getattr(module, split)
            if new_module is None:
                raise ValueError(f"{module} has no attribute {split}.")
            module = new_module
        tensor_name = splits[-1]

    if tensor_name not in module._parameters and tensor_name not in module._buffers:
        raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")
    is_buffer = tensor_name in module._buffers
    old_value = getattr(module, tensor_name)

    # Treat the case where old_value (or a custom `value`, typically offloaded to RAM/disk) belongs to a tied group, and one of the weight
    # in the tied group has already been dispatched to the device, by avoiding reallocating memory on the device and just copying the pointer.
    # NOTE(review): both branches below assign into `module._parameters` without checking `is_buffer` — confirm
    # that a tied buffer can never reach them.
    if (
        value is not None
        and tied_params_map is not None
        and value.data_ptr() in tied_params_map
        and device in tied_params_map[value.data_ptr()]
    ):
        module._parameters[tensor_name] = tied_params_map[value.data_ptr()][device]
        return
    elif (
        tied_params_map is not None
        and old_value.data_ptr() in tied_params_map
        and device in tied_params_map[old_value.data_ptr()]
    ):
        module._parameters[tensor_name] = tied_params_map[old_value.data_ptr()][device]
        return

    if old_value.device == torch.device("meta") and device not in ["meta", torch.device("meta")] and value is None:
        raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {device}.")

    # `param` is None for buffers, in which case `param_cls` is `NoneType`.
    param = module._parameters[tensor_name] if tensor_name in module._parameters else None
    param_cls = type(param)

    if value is not None:
        # We can expect mismatches when using bnb 4bit since Params4bit will reshape and pack the weights.
        # In other cases, we want to make sure we're not loading checkpoints that do not match the config.
        if old_value.shape != value.shape and param_cls.__name__ != "Params4bit":
            raise ValueError(
                f'Trying to set a tensor of shape {value.shape} in "{tensor_name}" (which has shape {old_value.shape}), this looks incorrect.'
            )

        if dtype is None:
            # For compatibility with PyTorch load_state_dict which converts state dict dtype to existing dtype in model
            value = value.to(old_value.dtype, non_blocking=non_blocking)
        elif not str(value.dtype).startswith(("torch.uint", "torch.int", "torch.bool")):
            # Integer and boolean values are never cast to the requested `dtype`.
            value = value.to(dtype, non_blocking=non_blocking)

    device_quantization = None
    with torch.no_grad():
        # Quantized parameter classes that are not yet on cuda/xpu but are headed there are first built on cpu;
        # the real target is remembered in `device_quantization` and restored further down.
        if (
            param is not None
            and param.device.type not in ("cuda", "xpu")
            and torch.device(device).type in ("cuda", "xpu")
            and param_cls.__name__ in ["Int8Params", "FP4Params", "Params4bit"]
        ):
            device_quantization = device
            device = "cpu"
        # `torch.Tensor.to(<int num>)` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
        if isinstance(device, int):
            if is_npu_available():
                device = f"npu:{device}"
            elif is_mlu_available():
                device = f"mlu:{device}"
            elif is_sdaa_available():
                device = f"sdaa:{device}"
            elif is_musa_available():
                device = f"musa:{device}"
            elif is_hpu_available():
                device = "hpu"
        if "xpu" in str(device) and not is_xpu_available():
            raise ValueError(f'{device} is not available, you should use device="cpu" instead')
        if value is None:
            new_value = old_value.to(device, non_blocking=non_blocking)
            # Without a `value`, the `dtype` cast is only applied when the target is the meta device.
            if dtype is not None and device in ["meta", torch.device("meta")]:
                if not str(old_value.dtype).startswith(("torch.uint", "torch.int", "torch.bool")):
                    new_value = new_value.to(dtype, non_blocking=non_blocking)

                if not is_buffer:
                    module._parameters[tensor_name] = param_cls(new_value, requires_grad=old_value.requires_grad)
        elif isinstance(value, torch.Tensor):
            new_value = value.to(device, non_blocking=non_blocking)
        else:
            new_value = torch.tensor(value, device=device)
        # Restore the real target device that was set aside for quantized parameters above.
        if device_quantization is not None:
            device = device_quantization
        if is_buffer:
            module._buffers[tensor_name] = new_value
        elif value is not None or not check_device_same(torch.device(device), module._parameters[tensor_name].device):
            param_cls = type(module._parameters[tensor_name])
            # Attributes stored on the existing parameter are forwarded as keyword arguments when the bnb
            # parameter classes are rebuilt below; `_is_hf_initialized` is taken out and re-attached afterwards.
            kwargs = module._parameters[tensor_name].__dict__
            is_hf_initialized = kwargs.pop("_is_hf_initialized", None)
            if param_cls.__name__ in ["Int8Params", "FP4Params", "Params4bit"]:
                if param_cls.__name__ == "Int8Params" and new_value.dtype == torch.float32:
                    # downcast to fp16 if any - needed for 8bit serialization
                    new_value = new_value.to(torch.float16, non_blocking=non_blocking)
                # quantize module that are going to stay on the cpu so that we offload quantized weights
                if device == "cpu" and param_cls.__name__ == "Int8Params":
                    new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(0).to("cpu")
                    new_value.CB = new_value.CB.to("cpu")
                    new_value.SCB = new_value.SCB.to("cpu")
                else:
                    new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(
                        device, non_blocking=non_blocking
                    )
            elif param_cls.__name__ in ["QTensor", "QBitsTensor"]:
                new_value = torch.nn.Parameter(new_value, requires_grad=old_value.requires_grad).to(
                    device, non_blocking=non_blocking
                )
            elif param_cls.__name__ in ["AffineQuantizedTensor"] or "torchao" in getattr(param_cls, "__module__", ""):
                new_value = new_value.to(device, non_blocking=non_blocking)
            else:
                new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(
                    device, non_blocking=non_blocking
                )

            if is_hf_initialized is not None:
                new_value._is_hf_initialized = is_hf_initialized
            module._parameters[tensor_name] = new_value
            if fp16_statistics is not None:
                module._parameters[tensor_name].SCB = fp16_statistics.to(device, non_blocking=non_blocking)
                del fp16_statistics
            # as we put the weight to meta, it doesn't have SCB attr anymore. make sure that it is not a meta weight
            if (
                module.__class__.__name__ == "Linear8bitLt"
                and getattr(module.weight, "SCB", None) is None
                and str(module.weight.device) != "meta"
            ):
                # quantize only if necessary
                device_index = torch.device(device).index if torch.device(device).type in ["cuda", "xpu"] else None
                if not getattr(module.weight, "SCB", None) and device_index is not None:
                    if module.bias is not None and module.bias.device.type != "meta":
                        # if a bias exists, we need to wait until the bias is set on the correct device
                        module = module.to(device_index)
                    elif module.bias is None:
                        # if no bias exists, we can quantize right away
                        module = module.to(device_index)
            elif (
                module.__class__.__name__ == "Linear4bit"
                and getattr(module.weight, "quant_state", None) is None
                and str(module.weight.device) != "meta"
            ):
                # quantize only if necessary
                device_index = torch.device(device).index if torch.device(device).type in ["cuda", "xpu"] else None
                if not getattr(module.weight, "quant_state", None) and device_index is not None:
                    module.weight = module.weight.to(device_index)

    # Clear the device cache after the transfer, unless disabled or the target is "cpu"/"meta".
    if clear_cache and device not in ("cpu", "meta"):
        clear_device_cache()

    # When handling tied weights, we update tied_params_map to keep track of the tied weights that have already been allocated on the device in
    # order to avoid duplicating memory, see above.
    if (
        tied_params_map is not None
        and old_value.data_ptr() in tied_params_map
        and device not in tied_params_map[old_value.data_ptr()]
    ):
        tied_params_map[old_value.data_ptr()][device] = new_value
    elif (
        value is not None
        and tied_params_map is not None
        and value.data_ptr() in tied_params_map
        and device not in tied_params_map[value.data_ptr()]
    ):
        tied_params_map[value.data_ptr()][device] = new_value


def named_module_tensors(
    module: nn.Module, include_buffers: bool = True, recurse: bool = False, remove_non_persistent: bool = False
):
    """
    Generator over the `(name, tensor)` pairs of a module: all parameters first, then (optionally) the buffers. With
    `include_buffers=True` and `remove_non_persistent=False` this is the same as chaining
    `module.named_parameters(recurse=recurse)` and `module.named_buffers(recurse=recurse)`.

    Args:
        module (`torch.nn.Module`):
            The module we want the tensors on.
        include_buffers (`bool`, *optional*, defaults to `True`):
            Whether or not to include the buffers in the result.
        recurse (`bool`, *optional*, defaults to `False`):
            Whether or not to go look in every submodule or just return the direct parameters and buffers.
        remove_non_persistent (`bool`, *optional*, defaults to `False`):
            Whether or not to skip the buffers whose name is reported by `get_non_persistent_buffers`. Only has an
            effect when `include_buffers=True`.
    """
    yield from module.named_parameters(recurse=recurse)

    if not include_buffers:
        return

    # Names to leave out; stays empty unless the caller asked to drop non persistent buffers.
    skipped_names = get_non_persistent_buffers(module, recurse=recurse) if remove_non_persistent else set()
    for buffer_entry in module.named_buffers(recurse=recurse):
        if buffer_entry[0] in skipped_names:
            continue
        yield buffer_entry


def get_non_persistent_buffers(module: nn.Module, recurse: bool = False, fqns: bool = False):
    """
    Gather the names of all non persistent buffers of a given module into a new set.

    The returned set is always a fresh object: the module's own `_non_persistent_buffers_set` is never modified.

    Args:
        module (`nn.Module`):
            The module we want the non persistent buffers on.
        recurse (`bool`, *optional*, defaults to `False`):
            Whether or not to go look in every submodule or just return the direct non persistent buffers.
        fqns (`bool`, *optional*, defaults to `False`):
            Whether or not to return the fully-qualified names of the non persistent buffers. Only has an effect when
            `recurse=True`; buffers owned by `module` itself keep their bare name.

    Returns:
        `Set[str]`: The names of the non persistent buffers.
    """
    # Work on a copy: updating the module's own set in place would add the names of the submodules' buffers to
    # the module's bookkeeping of which buffers are persistent.
    non_persistent_buffers = set(module._non_persistent_buffers_set)
    if not recurse:
        return non_persistent_buffers

    for submodule_name, submodule in module.named_modules():
        local_names = submodule._non_persistent_buffers_set
        if fqns and submodule_name:
            non_persistent_buffers.update(f"{submodule_name}.{buffer_name}" for buffer_name in local_names)
        else:
            # Either bare names were requested, or this is the root module (empty name) whose buffers need no
            # prefix.
            non_persistent_buffers.update(local_names)

    return non_persistent_buffers


def check_tied_parameters_in_config(model: nn.Module):
    """
    Tell whether the given model shows any sign that some of its weights are expected to be tied.

    Only models having a class named `PreTrainedModel` in their MRO are inspected; any other model yields `False`.

    Args:
        model (`torch.nn.Module`): The model to inspect

    Returns:
        bool: True if the model needs to have tied weights
    """
    # The checks below mirror the model.tie_weights() method
    ancestor_names = {klass.__name__ for klass in inspect.getmro(model.__class__)}
    if "PreTrainedModel" not in ancestor_names:
        return False

    has_config = hasattr(model, "config")

    # Prefer the decoder text config when the config object can provide one.
    decoder_config = None
    if has_config:
        config = model.config
        decoder_config = config.get_text_config(decoder=True) if hasattr(config, "get_text_config") else config

    tied_word_embedding = (
        decoder_config is not None
        and getattr(decoder_config, "tie_word_embeddings", False)
        and model.get_output_embeddings()
    )
    tied_encoder_decoder = (
        has_config
        and getattr(model.config, "is_encoder_decoder", False)
        and getattr(model.config, "tie_encoder_decoder", False)
    )
    # Any submodule exposing `_tie_weights` counts as an indication too.
    tied_module = any(hasattr(submodule, "_tie_weights") for submodule in model.modules())

    return bool(tied_word_embedding or tied_encoder_decoder or tied_module)


def _get_param_device(param, device_map):
    """
    Look up the device of `param` in `device_map`, falling back to its closest ancestor module.

    The dotted name is shortened one component at a time (`"a.b.c"`, `"a.b"`, `"a"`, `""`) until a key of
    `device_map` is found.

    Args:
        param (`str`): The dotted name of a parameter or module.
        device_map (`Dict[str, Union[int, str, torch.device]]`): A map that specifies where each submodule should go.

    Returns:
        The `device_map` value of the closest ancestor (or of `param` itself).

    Raises:
        ValueError: If neither `param` nor any of its ancestors (including the root `""`) is in `device_map`.
    """
    candidate = param
    while True:
        if candidate in device_map:
            return device_map[candidate]
        if candidate == "":
            # Report the name that was asked for, not the empty string the search ended on.
            raise ValueError(f"The `device_map` does not contain the module {param}.")
        # Drop the last dotted component; a name without any dot becomes the root "".
        candidate = candidate.rpartition(".")[0]


def check_tied_parameters_on_same_device(tied_params, device_map):
    """
    Log a warning for every group of tied parameters that `device_map` spreads over more than one device.

    Args:
        tied_params (`List[List[str]]`):
            A list of lists of parameter names being all tied together.

        device_map (`Dict[str, Union[int, str, torch.device]]`):
            A map that specifies where each submodule should go.

    """
    for tied_group in tied_params:
        devices_by_param = {name: _get_param_device(name, device_map) for name in tied_group}
        distinct_devices = set(devices_by_param.values())
        if len(distinct_devices) <= 1:
            # Whole group lives on a single device (or the group is empty): nothing to report.
            continue
        logger.warning(
            f"Tied parameters are on different devices: {devices_by_param}. "
            "Please modify your custom device map or set `device_map='auto'`. "
        )


def find_tied_parameters(model: torch.nn.Module, **kwargs) -> list[list[str]]:
    """
    Find the tied parameters in a given model.

    <Tip warning={true}>

    The signature accepts keyword arguments, but they are for the recursive part of this function and you should ignore
    them.

    </Tip>

    Args:
        model (`torch.nn.Module`): The model to inspect.

    Returns:
        List[List[str]]: A list of lists of parameter names being all tied together. Each group is sorted
        alphabetically, and groups appear in the order in which their first tied name is yielded by
        `model.named_parameters(remove_duplicate=False)`.

    Example:

    ```py
    >>> from collections import OrderedDict
    >>> import torch.nn as nn

    >>> model = nn.Sequential(OrderedDict([("linear1", nn.Linear(4, 4)), ("linear2", nn.Linear(4, 4))]))
    >>> model.linear2.weight = model.linear1.weight
    >>> find_tied_parameters(model)
    [['linear1.weight', 'linear2.weight']]
    ```
    """
    # With `remove_duplicate=True` every distinct parameter object is reported once, under a single name. Index
    # those names by object identity so the extra names of a tied parameter can be matched in one pass.
    canonical_name_by_id = {id(param): name for name, param in model.named_parameters(remove_duplicate=True)}

    # Walk ALL the names in model order (instead of iterating a set of names) so the result is deterministic.
    tied_names_by_canonical = {}
    for name, param in model.named_parameters(remove_duplicate=False):
        canonical_name = canonical_name_by_id.get(id(param))
        if canonical_name is None or canonical_name == name:
            # Either the name under which the parameter is reported once, or an object we have no name for.
            continue
        tied_names_by_canonical.setdefault(canonical_name, set()).add(name)

    return [sorted({canonical_name, *tied_names}) for canonical_name, tied_names in tied_names_by_canonical.items()]


def retie_parameters(model, tied_params):
    """
    Re-establish the sharing between tied parameters of a model when the link was broken (for instance when adding
    hooks).

    For each group, the first parameter that is not on the meta device is assigned to every name of the group. A group
    whose parameters are all on the meta device is left untouched.

    Args:
        model (`torch.nn.Module`):
            The model in which to retie parameters.
        tied_params (`List[List[str]]`):
            Groups of tied parameter names, as obtained by `find_tied_parameters`.
    """
    meta_device = torch.device("meta")

    def _resolve(qualified_name):
        # Split "a.b.weight" into the object owning the attribute and the attribute name itself.
        *owner_path, attribute = qualified_name.split(".")
        owner = model
        for part in owner_path:
            owner = getattr(owner, part)
        return owner, attribute

    for group in tied_params:
        # First pass: pick the first materialized (non-meta) parameter of the group.
        source_param = None
        for qualified_name in group:
            owner, attribute = _resolve(qualified_name)
            candidate = getattr(owner, attribute)
            if candidate.device != meta_device:
                source_param = candidate
                break

        if source_param is None:
            continue

        # Second pass: point every name of the group at that parameter.
        for qualified_name in group:
            owner, attribute = _resolve(qualified_name)
            setattr(owner, attribute, source_param)


def _get_proper_dtype(dtype: Union[str, torch.device]) -> torch.dtype:
    """
    Resolve a dtype given by name into the matching `torch` attribute; anything that is not a string is returned
    unchanged.
    """
    if not isinstance(dtype, str):
        return dtype
    # Both the "torch.float16" and the bare "float16" spellings are accepted
    attribute_name = dtype.replace("torch.", "")
    return getattr(torch, attribute_name)


def compute_module_sizes(
    model: nn.Module,
    dtype: Optional[Union[str, torch.device]] = None,
    special_dtypes: Optional[dict[str, Union[str, torch.device]]] = None,
    buffers_only: bool = False,
):
    """
    Compute the size of each submodule of a given model.

    The size of every tensor is added to the entry of the tensor itself and to the entry of each of its dotted
    prefixes, down to the empty string which therefore holds the total.

    Args:
        model (`torch.nn.Module`):
            The model whose tensors are measured.
        dtype (`str` or `torch.dtype`, *optional*):
            If provided, tensors are counted with the smaller of this dtype's size and their own, except for
            `uint`/`int`/`bool` tensors which always keep their own size.
        special_dtypes (`Dict[str, Union[str, torch.dtype]]`, *optional*):
            Per-tensor dtypes, keyed by tensor name, taking precedence over `dtype`.
        buffers_only (`bool`, *optional*, defaults to `False`):
            Whether to only measure the buffers instead of parameters and buffers.

    Returns:
        `defaultdict`: A mapping from name (tensor, module or `""`) to size.
    """
    if dtype is not None:
        dtype = _get_proper_dtype(dtype)
        dtype_size = dtype_byte_size(dtype)
    if special_dtypes is not None:
        special_dtypes = {key: _get_proper_dtype(dtyp) for key, dtyp in special_dtypes.items()}
        special_dtypes_size = {key: dtype_byte_size(dtyp) for key, dtyp in special_dtypes.items()}

    sizes_by_name = defaultdict(int)
    named_tensors = model.named_buffers(recurse=True) if buffers_only else named_module_tensors(model, recurse=True)

    for name, tensor in named_tensors:
        if special_dtypes is not None and name in special_dtypes:
            element_size = special_dtypes_size[name]
        elif dtype is None:
            element_size = dtype_byte_size(tensor.dtype)
        elif str(tensor.dtype).startswith(("torch.uint", "torch.int", "torch.bool")):
            # According to the code in set_module_tensor_to_device, these types won't be converted
            # so use their original size here
            element_size = dtype_byte_size(tensor.dtype)
        else:
            element_size = min(dtype_size, dtype_byte_size(tensor.dtype))
        size = tensor.numel() * element_size

        # Credit the root entry first, then every dotted prefix up to the full tensor name.
        sizes_by_name[""] += size
        prefix_parts = []
        for part in name.split("."):
            prefix_parts.append(part)
            sizes_by_name[".".join(prefix_parts)] += size

    return sizes_by_name


def compute_module_total_buffer_size(
    model: nn.Module,
    dtype: Optional[Union[str, torch.device]] = None,
    special_dtypes: Optional[dict[str, Union[str, torch.device]]] = None,
):
    """
    Compute the total size of all the buffers of a given model (submodules included).

    Returns 0 when `compute_module_sizes` reports no entry for the root module.
    """
    # The root entry ("") of the buffers-only size map holds the sum over all buffers.
    return compute_module_sizes(model, dtype=dtype, special_dtypes=special_dtypes, buffers_only=True).get("", 0)


def get_max_layer_size(
    modules: list[tuple[str, torch.nn.Module]], module_sizes: dict[str, int], no_split_module_classes: list[str]
):
    """
    Walk a list of named modules and report the maximum size used by one full layer. A layer is either:
    - an entry with no direct children (just parameters and buffers), which includes entries that are not modules
    - a module whose class name is in the list `no_split_module_classes`

    Any other module is replaced by its direct children, which are examined in turn.

    Args:
        modules (`List[Tuple[str, torch.nn.Module]]`):
            The list of named modules where we want to determine the maximum layer size.
        module_sizes (`Dict[str, int]`):
            A dictionary mapping each layer name to its size (as generated by `compute_module_sizes`).
        no_split_module_classes (`List[str]`):
            A list of class names for layers we don't want to be split.

    Returns:
        `Tuple[int, List[str]]`: The maximum size of a layer with the list of layer names realizing that maximum size.
    """
    largest_size = 0
    largest_names = []
    # Stack stored in reverse, so that `pop()` visits the entries in their original depth-first order.
    pending = list(reversed(modules))
    while pending:
        name, module = pending.pop()
        children = list(module.named_children()) if isinstance(module, torch.nn.Module) else []
        is_layer = not children or module.__class__.__name__ in no_split_module_classes
        if not is_layer:
            # Splittable container: its children take its place at the front of the traversal.
            pending.extend((f"{name}.{child_name}", child) for child_name, child in reversed(children))
            continue

        size = module_sizes[name]
        if size > largest_size:
            largest_size = size
            largest_names = [name]
        elif size == largest_size:
            largest_names.append(name)
    return largest_size, largest_names


def get_max_memory(max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None):
    """
    Get the maximum memory available if nothing is passed, converts string to int otherwise.

    Args:
        max_memory (`Dict`, *optional*):
            A dictionary device identifier to maximum memory. Integer keys designate accelerator indices; the only
            accepted string keys are `"mps"`, `"cpu"` and `"disk"`. String values are converted with
            `convert_file_size_to_int`; this conversion is written back into the dictionary that was passed.

    Returns:
        `Dict`: When `max_memory` is `None`, a new dictionary with one entry per accelerator that could be probed (the
        first value returned by the backend's `mem_get_info(i)`) plus an `"mps"` or a `"cpu"` entry holding
        `psutil.virtual_memory().available`. Otherwise, a new dictionary with the provided entries reordered as:
        integer keys in increasing order, then `"mps"`, `"cpu"` and `"disk"`.

    Raises:
        ValueError: If a key of `max_memory` is neither an integer nor one of `"mps"`, `"cpu"`, `"disk"`.
    """
    import psutil

    if max_memory is None:
        max_memory = {}
        # Make sure device is initialized on each device to have the right memory info.
        # Only the first backend reported as available in this chain is probed; CUDA is the fallback.
        if is_npu_available():
            for i in range(torch.npu.device_count()):
                try:
                    _ = torch.tensor(0, device=torch.device("npu", i))
                    max_memory[i] = torch.npu.mem_get_info(i)[0]
                except Exception:
                    # Best effort: a device that cannot be probed is simply left out of the result.
                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
                    continue
        elif is_mlu_available():
            for i in range(torch.mlu.device_count()):
                try:
                    _ = torch.tensor(0, device=torch.device("mlu", i))
                    max_memory[i] = torch.mlu.mem_get_info(i)[0]
                except Exception:
                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
                    continue
        elif is_sdaa_available():
            for i in range(torch.sdaa.device_count()):
                try:
                    _ = torch.tensor(0, device=torch.device("sdaa", i))
                    max_memory[i] = torch.sdaa.mem_get_info(i)[0]
                except Exception:
                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
                    continue
        elif is_musa_available():
            for i in range(torch.musa.device_count()):
                try:
                    _ = torch.tensor(0, device=torch.device("musa", i))
                    max_memory[i] = torch.musa.mem_get_info(i)[0]
                except Exception:
                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
                    continue
        elif is_xpu_available():
            for i in range(torch.xpu.device_count()):
                try:
                    _ = torch.tensor(0, device=torch.device("xpu", i))
                    max_memory[i] = torch.xpu.mem_get_info(i)[0]
                except Exception:
                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
                    continue
        elif is_hpu_available():
            for i in range(torch.hpu.device_count()):
                try:
                    _ = torch.tensor(0, device=torch.device("hpu", i))
                    max_memory[i] = torch.hpu.mem_get_info(i)[0]
                except Exception:
                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
                    continue
        else:
            # CUDA fallback: the loop body never runs when `torch.cuda.device_count()` is 0.
            for i in range(torch.cuda.device_count()):
                try:
                    _ = torch.tensor([0], device=i)
                    max_memory[i] = torch.cuda.mem_get_info(i)[0]
                except Exception:
                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
                    continue
        # allocate everything in the mps device as the RAM is shared
        if is_mps_available():
            max_memory["mps"] = psutil.virtual_memory().available
        else:
            max_memory["cpu"] = psutil.virtual_memory().available
        return max_memory

    # From here on, the caller provided its own limits: normalize them.
    # Note that the string values are replaced in the dictionary received as argument.
    for key in max_memory:
        if isinstance(max_memory[key], str):
            max_memory[key] = convert_file_size_to_int(max_memory[key])

    # Need to sort the device by type to make sure that we allocate the gpu first.
    # As gpu/npu/xpu are represented by int, we need to sort them first.
    gpu_devices = [k for k in max_memory.keys() if isinstance(k, int)]
    gpu_devices.sort()
    # check if gpu/npu/xpu devices are available and if not, throw a warning
    if is_npu_available():
        num_devices = torch.npu.device_count()
    elif is_mlu_available():
        num_devices = torch.mlu.device_count()
    elif is_sdaa_available():
        num_devices = torch.sdaa.device_count()
    elif is_musa_available():
        num_devices = torch.musa.device_count()
    elif is_xpu_available():
        num_devices = torch.xpu.device_count()
    elif is_hpu_available():
        num_devices = torch.hpu.device_count()
    else:
        num_devices = torch.cuda.device_count()
    for device in gpu_devices:
        # An out-of-range index only triggers a warning; the entry is still kept in the result.
        if device >= num_devices or device < 0:
            logger.warning(f"Device {device} is not available, available devices are {list(range(num_devices))}")
    # Add the other devices in the preset order if they are available
    all_devices = gpu_devices + [k for k in ["mps", "cpu", "disk"] if k in max_memory.keys()]
    # Raise an error if a device is not recognized
    for k in max_memory.keys():
        if k not in all_devices:
            raise ValueError(
                f"Device {k} is not recognized, available devices are integers(for GPU/XPU), 'mps', 'cpu' and 'disk'"
            )
    # Rebuild the dictionary so that its iteration order follows `all_devices`.
    max_memory = {k: max_memory[k] for k in all_devices}

    return max_memory


def clean_device_map(device_map: dict[str, Union[int, str, torch.device]], module_name: str = ""):
    """
    Simplify a device_map in place by merging, under their parent module, all the submodules that share one device.

    Args:
        device_map (`Dict[str, Union[int, str, torch.device]]`):
            The device map to clean. It is modified in place and also returned.
        module_name (`str`, *optional*, defaults to `""`):
            The module whose entries are examined; used by the recursion, the default being the root of the model.

    Returns:
        `Dict[str, Union[int, str, torch.device]]`: The same `device_map` object.
    """
    prefix = f"{module_name}." if module_name != "" else ""

    # If the current module is split across several keys that all share one device, collapse them into one.
    matching_keys = [key for key in device_map if key.startswith(prefix)]
    matching_devices = [device_map[key] for key in matching_keys]
    if len(set(matching_devices)) == 1 and len(matching_devices) > 1:
        for key in matching_keys:
            del device_map[key]
        device_map[module_name] = matching_devices[0]

    # Recurse over the children: truncate each remaining key to one more dotted level than the current module.
    depth = module_name.count(".") + 2 if len(module_name) > 0 else 1
    child_names = {
        ".".join(key.split(".")[:depth])
        for key in device_map
        if key.startswith(prefix) and len(key) > len(module_name)
    }
    for child_name in child_names:
        clean_device_map(device_map, module_name=child_name)

    return device_map


def load_offloaded_weights(model, index, offload_folder):
    """
    Read the weights stored in the offload folder and set them in the model, on the CPU.

    Entries whose name contains `"SCB"` are not loaded on their own: they are read together with the weight they
    belong to and passed along as its `fp16_statistics`.

    Args:
        model (`torch.nn.Module`):
            The model to load the weights into.
        index (`dict`):
            A dictionary containing the parameter name and its metadata for each parameter that was offloaded from the
            model.
        offload_folder (`str`):
            The folder where the offloaded weights are stored.
    """
    if index is None or len(index) < 1:
        # Nothing to do
        return

    for param_name, metadata in index.items():
        if "SCB" in param_name:
            continue

        # Name under which the statistics matching this weight would have been offloaded, if any.
        statistics_name = param_name.replace("weight", "SCB")
        fp16_statistics = None
        if "weight" in param_name and statistics_name in index:
            statistics_file = os.path.join(offload_folder, f"{statistics_name}.dat")
            fp16_statistics = load_offloaded_weight(statistics_file, index[statistics_name])

        weight = load_offloaded_weight(os.path.join(offload_folder, f"{param_name}.dat"), metadata)
        set_module_tensor_to_device(model, param_name, "cpu", value=weight, fp16_statistics=fp16_statistics)


def get_module_leaves(module_sizes):
    """
    Return the names in `module_sizes` that are not the direct parent of any other name.

    A name is a parent when some other dotted name, stripped of its last component, equals it. The root entry `""` is
    never returned. The order of `module_sizes` is preserved.
    """
    # Only dotted names have a parent recorded; top-level names and the root "" contribute none.
    parents = {name.rsplit(".", 1)[0] for name in module_sizes if "." in name}
    return [name for name in module_sizes if name != "" and name not in parents]


def get_balanced_memory(
    model: nn.Module,
    max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
    no_split_module_classes: Optional[list[str]] = None,
    dtype: Optional[Union[str, torch.dtype]] = None,
    special_dtypes: Optional[dict[str, Union[str, torch.device]]] = None,
    low_zero: bool = False,
):
    """
    Compute a `max_memory` dictionary for [`infer_auto_device_map`] that will balance the use of each available GPU.

    <Tip>

    All computation is done analyzing sizes and dtypes of the model parameters. As a result, the model can be on the
    meta device (as it would if initialized within the `init_empty_weights` context manager).

    </Tip>

    Args:
        model (`torch.nn.Module`):
            The model to analyze.
        max_memory (`Dict`, *optional*):
            A dictionary device identifier to maximum memory. Will default to the maximum memory available if unset.
            Example: `max_memory={0: "1GB"}`.
        no_split_module_classes (`List[str]`, *optional*):
            A list of layer class names that should never be split across device (for instance any layer that has a
            residual connection). A single value that is not a list or tuple is wrapped into a list.
        dtype (`str` or `torch.dtype`, *optional*):
            If provided, the weights will be converted to that type when loaded.
        special_dtypes (`Dict[str, Union[str, torch.dtype]]`, *optional*):
            If provided, special dtypes to consider for some specific weights (will override dtype used as default for
            all weights).
        low_zero (`bool`, *optional*):
            Minimizes the number of weights on GPU 0, which is convenient when it's used for other operations (like the
            Transformers generate function). Ignored when only one device is usable.

    Returns:
        `Dict`: The dictionary produced by `get_max_memory`, with the entries of the accelerators adjusted. It is
        returned unchanged when no entry with a positive amount of memory matches the expected device type.
    """
    # Get default / clean up max_memory
    user_not_set_max_memory = max_memory is None
    max_memory = get_max_memory(max_memory)

    # Device type the entries of `max_memory` are expected to resolve to; same priority order as the backend
    # probing, with mps checked right before the cuda fallback.
    if is_npu_available():
        expected_device_type = "npu"
    elif is_mlu_available():
        expected_device_type = "mlu"
    elif is_sdaa_available():
        expected_device_type = "sdaa"
    elif is_musa_available():
        expected_device_type = "musa"
    elif is_xpu_available():
        expected_device_type = "xpu"
    elif is_hpu_available():
        expected_device_type = "hpu"
    elif is_mps_available():
        expected_device_type = "mps"
    else:
        expected_device_type = "cuda"
    # Count the entries of that type that have some memory to offer.
    num_devices = len([d for d in max_memory if torch.device(d).type == expected_device_type and max_memory[d] > 0])

    if num_devices == 0:
        return max_memory

    if num_devices == 1:
        # We cannot do low_zero on just one GPU, but we will still reserve some memory for the buffer
        low_zero = False
        # If user just asked us to handle memory usage, we should avoid OOM
        if user_not_set_max_memory:
            for key in max_memory.keys():
                if isinstance(key, int):
                    max_memory[key] *= 0.9  # 90% is a good compromise
                    logger.info(
                        f"We will use 90% of the memory on device {key} for storing the model, and 10% for the buffer to avoid OOM. "
                        "You can set `max_memory` in to a higher value to use more memory (at your own risk)."
                    )
                    break  # only one device

    module_sizes = compute_module_sizes(model, dtype=dtype, special_dtypes=special_dtypes)
    # Even share of the whole model; with `low_zero`, device 0 is left out of the split.
    per_gpu = module_sizes[""] // (num_devices - 1 if low_zero else num_devices)

    # We can't just set the memory to model_size // num_devices as it will end being too small: each GPU will get
    # slightly less layers and some layers will end up offload at the end. So this function computes a buffer size to
    # add which is the biggest of:
    # - the size of no split block (if applicable)
    # - the mean of the layer sizes
    if no_split_module_classes is None:
        no_split_module_classes = []
    elif not isinstance(no_split_module_classes, (list, tuple)):
        no_split_module_classes = [no_split_module_classes]

    # Identify the size of the no_split_block modules
    if len(no_split_module_classes) > 0:
        no_split_children = {}
        for name, size in module_sizes.items():
            if name == "":
                continue
            # Resolve the dotted name to the actual object to read its class name.
            submodule = model
            for submodule_name in name.split("."):
                submodule = getattr(submodule, submodule_name)
            class_name = submodule.__class__.__name__
            # Only the first instance met of each no-split class is measured.
            if class_name in no_split_module_classes and class_name not in no_split_children:
                no_split_children[class_name] = size

            # Stop scanning as soon as every requested class has been seen once.
            if set(no_split_children.keys()) == set(no_split_module_classes):
                break
        buffer = max(no_split_children.values()) if len(no_split_children) > 0 else 0
    else:
        buffer = 0

    # Compute mean of final modules. In the first dict of module sizes, leaves are the parameters
    leaves = get_module_leaves(module_sizes)
    leaves_set = set(leaves)  # Convert to set for O(1) membership testing
    module_sizes = {n: v for n, v in module_sizes.items() if n not in leaves_set}
    # Once removed, leaves are the final modules.
    leaves = get_module_leaves(module_sizes)
    # `max(len(leaves), 1)` guards against a division by zero when no leaf is left.
    mean_leaves = int(sum([module_sizes[n] for n in leaves]) / max(len(leaves), 1))
    buffer = int(1.25 * max(buffer, mean_leaves))
    per_gpu += buffer

    # Sorted list of GPUs id (we may have some gpu ids not included in the our max_memory list - let's ignore them)
    gpus_idx_list = list(
        sorted(
            device_id for device_id, device_mem in max_memory.items() if isinstance(device_id, int) and device_mem > 0
        )
    )
    # The last device is left with max_memory just in case the buffer is not enough.
    # NOTE(review): `max_memory[0]` is read for every index even when it is not selected, so this presumably
    # expects a key `0` to exist whenever more than one device is listed - confirm.
    for idx in gpus_idx_list[:-1]:
        max_memory[idx] = min(max_memory[0] if low_zero and idx == 0 else per_gpu, max_memory[idx])

    if low_zero:
        # Device 0 only receives what the other devices cannot hold.
        # NOTE(review): `range(1, num_devices)` assumes the device ids 1..num_devices-1 are all keys of
        # `max_memory`; non-contiguous ids look like they would raise a KeyError here - confirm.
        min_zero = max(0, module_sizes[""] - sum([max_memory[i] for i in range(1, num_devices)]))
        max_memory[0] = min(min_zero, max_memory[0])

    return max_memory


def calculate_maximum_sizes(model: torch.nn.Module):
    """
    Computes the total size of the model and its largest layer.

    Returns:
        `Tuple[int, Tuple[int, List[str]]]`: The total size, followed by the result of `get_max_layer_size` (maximum
        layer size and the names of the layers realizing it).
    """
    module_sizes = compute_module_sizes(model)

    # `transformers` models store this information for us
    no_split_classes = getattr(model, "_no_split_modules", None)
    if no_split_classes is None:
        no_split_classes = []

    # Direct parameters, children and buffers of the model, in that order.
    top_level_entries = [
        *model.named_parameters(recurse=False),
        *model.named_children(),
        *model.named_buffers(recurse=False),
    ]
    largest_layer = get_max_layer_size(top_level_entries, module_sizes, no_split_classes)
    return module_sizes[""], largest_layer


def _init_infer_auto_device_map(
    model: nn.Module,
    max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
    no_split_module_classes: Optional[list[str]] = None,
    dtype: Optional[Union[str, torch.dtype]] = None,
    special_dtypes: Optional[dict[str, Union[str, torch.device]]] = None,
) -> tuple[
    list[Union[int, str]],
    dict[Union[int, str], Union[int, str]],
    list[Union[int, str]],
    list[int],
    dict[str, int],
    list[list[str]],
    list[str],
    list[tuple[str, nn.Module]],
]:
    """
    Prepare the variables needed to compute the device map used for model allocation.

    Returns:
        A tuple `(devices, max_memory, main_devices, gpus, module_sizes, tied_parameters, no_split_module_classes,
        modules_to_treat)` where `devices` always ends up containing `"disk"`, `gpus` holds every device other than
        `"cpu"` and `"disk"`, and `modules_to_treat` lists the direct parameters, children and buffers of the model.
    """
    max_memory = get_max_memory(max_memory)

    if no_split_module_classes is None:
        no_split_module_classes = []
    elif not isinstance(no_split_module_classes, (list, tuple)):
        no_split_module_classes = [no_split_module_classes]

    devices = list(max_memory)
    if "disk" not in devices:
        devices.append("disk")
    gpus = [device for device in devices if device not in ("cpu", "disk")]

    # Devices that need to keep space for a potential offloaded layer.
    if "mps" in gpus:
        main_devices = ["mps"]
    elif gpus:
        main_devices = [gpus[0], "cpu"]
    else:
        main_devices = ["cpu"]

    module_sizes = compute_module_sizes(model, dtype=dtype, special_dtypes=special_dtypes)
    tied_parameters = find_tied_parameters(model)
    # The model advertises tied weights but none were found: they have most likely not been tied yet.
    if check_tied_parameters_in_config(model) and len(tied_parameters) == 0:
        logger.warning(
            "The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function."
        )

    # Direct submodules and parameters
    modules_to_treat = [
        *model.named_parameters(recurse=False),
        *model.named_children(),
        *model.named_buffers(recurse=False),
    ]

    return (
        devices,
        max_memory,
        main_devices,
        gpus,
        module_sizes,
        tied_parameters,
        no_split_module_classes,
        modules_to_treat,
    )


def get_module_size_with_ties(
    tied_params,
    module_size,
    module_sizes,
    modules_to_treat,
) -> tuple[int, list[str], list[nn.Module]]:
    """
    Compute the size of a module once the modules holding its tied parameters are counted with it.

    For each tied parameter, the first entry of `modules_to_treat` whose name is a dotted prefix of the parameter is
    looked up, and its size minus the size of the tied parameter itself is added to `module_size`.

    Args:
        tied_params (`List[str]`): The list of tied parameters.
        module_size (`int`): The size of the module without tied parameters.
        module_sizes (`Dict[str, int]`): A dictionary mapping each layer name to its size.
        modules_to_treat (`List[Tuple[str, nn.Module]]`): The list of named modules to treat.

    Returns:
        `Tuple[int, List[str], List[nn.Module]]`: The total size of the module, the names of the tied modules, and the
        tied modules.

    Raises:
        IndexError: If a tied parameter belongs to none of the entries of `modules_to_treat`.
    """
    if len(tied_params) < 1:
        return module_size, [], []

    # Phase 1: find, for every tied parameter, the first entry to treat that contains it.
    owners = []
    for tied_param in tied_params:
        candidates = [entry for entry in modules_to_treat if tied_param.startswith(entry[0] + ".")]
        owners.append(candidates[0])
    tied_module_names = [entry[0] for entry in owners]
    tied_modules = [entry[1] for entry in owners]

    # Phase 2: add each owner's size, without counting the shared parameter a second time.
    total_size = module_size
    for tied_param, owner_name in zip(tied_params, tied_module_names):
        total_size += module_sizes[owner_name] - module_sizes[tied_param]

    return total_size, tied_module_names, tied_modules


def fallback_allocate(
    modules: list[tuple[str, nn.Module]],
    module_sizes: dict[str, int],
    size_limit: Union[int, str],
    no_split_module_classes: Optional[list[str]] = None,
    tied_parameters: Optional[list[list[str]]] = None,
) -> tuple[Optional[str], Optional[nn.Module], list[tuple[str, nn.Module]]]:
    """
    Look for a module (or parameter) small enough for `size_limit` and extract it from `modules`.

    Candidates are examined front to back. A candidate that is too large is replaced, at the front of the queue, by
    its direct parameters and children, unless it has no children or its class is listed in
    `no_split_module_classes`, in which case it is simply skipped.

    Args:
        modules (`List[Tuple[str, nn.Module]]`):
            The list of named modules to search in.
        module_sizes (`Dict[str, int]`):
            A dictionary mapping each layer name to its size (as generated by `compute_module_sizes`).
        size_limit (`Union[int, str]`):
            The maximum size a module can have. It is passed through `convert_file_size_to_int`.
        no_split_module_classes (`Optional[List[str]]`, *optional*):
            A list of class names for layers we don't want to be split.
        tied_parameters (`Optional[List[List[str]]`, *optional*):
            A list of lists of parameter names being all tied together.

    Returns:
        `Tuple[Optional[str], Optional[nn.Module], List[Tuple[str, nn.Module]]]`: A tuple containing:
        - The name of the module that fits within the size limit.
        - The module itself.
        - The list of remaining modules after the found module is removed.

        When `size_limit` cannot be converted or no candidate fits, `(None, None, modules)` is returned.
    """
    try:
        size_limit = convert_file_size_to_int(size_limit)
    except ValueError:
        return None, None, modules

    no_split_module_classes = [] if no_split_module_classes is None else no_split_module_classes
    tied_parameters = [] if tied_parameters is None else tied_parameters

    pending = modules.copy()

    while pending:
        name, module = pending.pop(0)
        prefix = name + "."

        # Collect the parameters tied to this candidate that are located outside of it. Only groups that straddle
        # the candidate's boundary (some members inside, some outside) are relevant.
        outside_ties = []
        for tied_group in tied_parameters:
            inside_flags = [prefix in param + "." for param in tied_group]
            if any(inside_flags) and not all(inside_flags):
                outside_ties.extend(param for param, inside in zip(tied_group, inside_flags) if not inside)

        size_with_ties, _, _ = get_module_size_with_ties(outside_ties, module_sizes[name], module_sizes, pending)

        # The candidate (with its ties) fits: stop searching.
        if size_with_ties <= size_limit:
            break

        # Too big. Bare parameters/tensors cannot be split, move on.
        if isinstance(module, (nn.Parameter, torch.Tensor)):
            continue

        children = list(module.named_children())
        if len(children) == 0 or module.__class__.__name__ in no_split_module_classes:
            continue

        # Splitting is allowed: examine the direct parameters and the children next.
        expanded = list(module.named_parameters(recurse=False)) + children
        pending = [(f"{name}.{child_name}", child) for child_name, child in expanded] + pending
    else:
        # The queue was exhausted without finding anything that fits.
        return None, None, modules

    # Expand, in `modules`, every ancestor of the selected entry so that the entry itself appears in the list.
    names_in_list = [n for n, _ in modules]
    for dot_position, char in enumerate(name):
        if char != ".":
            continue
        ancestor = name[:dot_position]
        if ancestor not in names_in_list:
            continue

        ancestor_idx = names_in_list.index(ancestor)
        ancestor_module = modules[ancestor_idx][1]
        ancestor_content = list(ancestor_module.named_parameters(recurse=False)) + list(
            ancestor_module.named_children()
        )
        replacement = [(f"{ancestor}.{child_name}", child) for child_name, child in ancestor_content]
        modules = modules[:ancestor_idx] + replacement + modules[ancestor_idx + 1 :]
        names_in_list = [n for n, _ in modules]

    # The selected entry is now a direct element of the list. Note that when no ancestor had to be expanded, `modules`
    # is still the list object received from the caller, so this `pop` modifies it in place.
    name, module = modules.pop(names_in_list.index(name))

    return name, module, modules


def infer_auto_device_map(
    model: nn.Module,
    max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
    no_split_module_classes: Optional[list[str]] = None,
    dtype: Optional[Union[str, torch.dtype]] = None,
    special_dtypes: Optional[dict[str, Union[str, torch.dtype]]] = None,
    verbose: bool = False,
    clean_result: bool = True,
    offload_buffers: bool = False,
    fallback_allocation: bool = False,
):
    """
    Compute a device map for a given model giving priority to GPUs, then offload on CPU and finally offload to disk,
    such that:
    - we don't exceed the memory available of any of the GPU.
    - if offload to the CPU is needed, there is always room left on GPU 0 to put back the layer offloaded on CPU that
      has the largest size.
    - if offload to the CPU is needed, we don't exceed the RAM available on the CPU.
    - if offload to the disk is needed, there is always room left on the CPU to put back the layer offloaded on disk
      that has the largest size.

    <Tip>

    All computation is done analyzing sizes and dtypes of the model parameters. As a result, the model can be on the
    meta device (as it would if initialized within the `init_empty_weights` context manager).

    </Tip>

    Args:
        model (`torch.nn.Module`):
            The model to analyze.
        max_memory (`Dict`, *optional*):
            A dictionary device identifier to maximum memory. Will default to the maximum memory available if unset.
            Example: `max_memory={0: "1GB"}`.
        no_split_module_classes (`List[str]`, *optional*):
            A list of layer class names that should never be split across device (for instance any layer that has a
            residual connection).
        dtype (`str` or `torch.dtype`, *optional*):
            If provided, the weights will be converted to that type when loaded.
        special_dtypes (`Dict[str, Union[str, torch.dtype]]`, *optional*):
            If provided, special dtypes to consider for some specific weights (will override dtype used as default for
            all weights).
        verbose (`bool`, *optional*, defaults to `False`):
            Whether or not to provide debugging statements as the function builds the device_map.
        clean_result (`bool`, *optional*, defaults to `True`):
            Clean the resulting device_map by grouping all submodules that go on the same device together.
        offload_buffers (`bool`, *optional*, defaults to `False`):
            In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
            well as the parameters.
        fallback_allocation (`bool`, *optional*, defaults to `False`):
            When regular allocation fails, try to allocate a module that fits in the size limit using
            `fallback_allocate`.

    Returns:
        The device map, built as an `OrderedDict` mapping the names of modules, parameters and buffers to the device
        they were assigned to. When `clean_result` is `True`, the value returned is the output of `clean_device_map`
        applied to that mapping.
    """

    # Initialize the variables
    (
        devices,
        max_memory,
        main_devices,
        gpus,
        module_sizes,
        tied_parameters,
        no_split_module_classes,
        modules_to_treat,
    ) = _init_infer_auto_device_map(model, max_memory, no_split_module_classes, dtype, special_dtypes)

    device_map = OrderedDict()
    # Index, in `devices`, of the device currently being filled. It only ever increases.
    current_device = 0
    # Memory accounted for on each device so far (including the space reserved when leaving a main device).
    device_memory_used = {device: 0 for device in devices}
    # Total buffer size of the modules placed on each device; only filled when `offload_buffers` is False.
    device_buffer_sizes = {}
    # For devices that were left without any module: the size that would have been needed to place one. Only used
    # for the informational log at the end.
    device_minimum_assignment_memory = {}

    # Initialize maximum largest layer, to know which space to keep in memory
    max_layer_size, max_layer_names = get_max_layer_size(modules_to_treat, module_sizes, no_split_module_classes)

    # Ready ? This is going to be a bit messy.
    while len(modules_to_treat) > 0:
        name, module = modules_to_treat.pop(0)
        if verbose:
            print(f"\nTreating module {name}.")
        # Max size in the remaining layers may have changed since we took one, so we maybe update it.
        max_layer_names = [n for n in max_layer_names if n != name and not n.startswith(name + ".")]
        if len(max_layer_names) == 0:
            max_layer_size, max_layer_names = get_max_layer_size(
                [(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
                module_sizes,
                no_split_module_classes,
            )
        # Assess size needed
        module_size = module_sizes[name]

        # We keep relevant tied parameters only: one of the tied parameters in the group is inside the current module
        # and the other is not.
        # Note: If we are currently processing the name `compute.weight`, an other parameter named
        # e.g. `compute.weight_submodule.parameter`
        # needs to be considered outside the current module, hence the check with additional dots.
        tied_param_groups = [
            tied_group
            for tied_group in tied_parameters
            if any(name + "." in k + "." for k in tied_group) and not all(name + "." in k + "." for k in tied_group)
        ]

        if verbose and len(tied_param_groups) > 0:
            print(f"  Found the relevant tied param groups {tied_param_groups}")

        # Then we keep track of all the parameters that are tied to the current module, but not in the current module
        tied_params = sum(
            [[p for p in tied_group if name + "." not in p + "."] for tied_group in tied_param_groups], []
        )

        if verbose and len(tied_params) > 0:
            print(f"  So those parameters need to be taken into account {tied_params}")

        device = devices[current_device]
        # `None` means "no limit": anything fits on the disk.
        current_max_size = max_memory[device] if device != "disk" else None
        current_memory_reserved = 0
        # Reduce max size available by the largest layer.
        if devices[current_device] in main_devices:
            current_max_size = current_max_size - max_layer_size
            current_memory_reserved = max_layer_size

        module_size_with_ties, tied_module_names, tied_modules = get_module_size_with_ties(
            tied_params, module_size, module_sizes, modules_to_treat
        )

        # The module and its tied modules fit on the current device.
        if current_max_size is None or device_memory_used[device] + module_size_with_ties <= current_max_size:
            if verbose:
                output = f"Putting {name}"

                if tied_module_names:
                    output += f" and {tied_module_names}"
                else:
                    output += f" (size={module_size})"

                if current_max_size is not None:
                    output += f" (available={current_max_size - device_memory_used[device]})"

                output += f" on {device}."
                print(output)

            device_memory_used[device] += module_size_with_ties

            # Assign the primary module to the device.
            device_map[name] = device

            # Assign tied modules if any.
            for tied_module_name in tied_module_names:
                if tied_module_name in [m[0] for m in modules_to_treat]:
                    # Find the index of the tied module in the list
                    tied_module_index = next(i for i, (n, _) in enumerate(modules_to_treat) if n == tied_module_name)
                    # Remove the tied module from the list to prevent reprocessing
                    modules_to_treat.pop(tied_module_index)

                # Assign the tied module to the device
                device_map[tied_module_name] = device

            # Buffer Handling
            if not offload_buffers and isinstance(module, nn.Module):
                # Compute the total buffer size for the module
                current_buffer_size = compute_module_total_buffer_size(
                    module, dtype=dtype, special_dtypes=special_dtypes
                )
                # Update the buffer size on the device
                device_buffer_sizes[device] = device_buffer_sizes.get(device, 0) + current_buffer_size

            continue

        # From here on `current_max_size` is not `None`: the branch above always places the module on the disk.

        # The current module itself fits, so we try to split the tied modules.
        if len(tied_params) > 0 and device_memory_used[device] + module_size <= current_max_size:
            # can we split one of the tied modules to make it smaller or do we need to go on the next device?
            if verbose:
                print(
                    f"Not enough space on {devices[current_device]} to put {name} and {tied_module_names} (space "
                    f"available {current_max_size - device_memory_used[device]}, needed size {module_size_with_ties})."
                )
            split_happened = False
            for tied_module_name, tied_module in zip(tied_module_names, tied_modules):
                tied_module_children = list(tied_module.named_children())
                if len(tied_module_children) == 0 or tied_module.__class__.__name__ in no_split_module_classes:
                    # can't break this one.
                    continue

                if verbose:
                    print(f"Splitting {tied_module_name}.")
                tied_module_children = list(tied_module.named_parameters(recurse=False)) + tied_module_children
                tied_module_children = [(f"{tied_module_name}.{n}", v) for n, v in tied_module_children]
                tied_module_index = [i for i, (n, _) in enumerate(modules_to_treat) if n == tied_module_name][0]

                # Put the current module back at the front and replace the tied module by its content, so that the
                # current module is re-examined against the smaller pieces on the next iteration.
                modules_to_treat = (
                    [(name, module)]
                    + modules_to_treat[:tied_module_index]
                    + tied_module_children
                    + modules_to_treat[tied_module_index + 1 :]
                )
                # Update the max layer size.
                max_layer_size, max_layer_names = get_max_layer_size(
                    [(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
                    module_sizes,
                    no_split_module_classes,
                )
                split_happened = True
                break

            if split_happened:
                continue

            # If the tied module is not split, we go to the next device
            if verbose:
                print("None of the tied module can be split, going to the next device.")

        # The current module itself doesn't fit, so we have to split it or go to the next device.
        if device_memory_used[device] + module_size >= current_max_size:
            # Split or not split?
            modules_children = (
                []
                if isinstance(module, nn.Parameter) or isinstance(module, torch.Tensor)
                else list(module.named_children())
            )
            if verbose:
                print(
                    f"Not enough space on {devices[current_device]} to put {name} (space available "
                    f"{current_max_size - device_memory_used[device]}, module size {module_size})."
                )
            if len(modules_children) == 0 or module.__class__.__name__ in no_split_module_classes:
                # -> no split, we go to the next device
                if verbose:
                    print("This module cannot be split, going to the next device.")

            else:
                # -> split, we replace the module studied by its children + parameters
                if verbose:
                    print(f"Splitting {name}.")
                modules_children = list(module.named_parameters(recurse=False)) + modules_children
                modules_to_treat = [(f"{name}.{n}", v) for n, v in modules_children] + modules_to_treat
                # Update the max layer size.
                max_layer_size, max_layer_names = get_max_layer_size(
                    [(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
                    module_sizes,
                    no_split_module_classes,
                )
                continue

        # If no module is assigned to the current device, we attempt to allocate a fallback module
        # if fallback_allocation is enabled.
        if device_memory_used[device] == 0 and fallback_allocation and device != "disk":
            # We try to allocate a module that fits in the size limit using `fallback_allocate`.
            # Recompute the current max size as we need to consider the current module as well.
            current_max_size = max_memory[device] - max(max_layer_size, module_size_with_ties)

            fallback_module_name, fallback_module, remaining_modules = fallback_allocate(
                modules_to_treat,
                module_sizes,
                current_max_size - device_memory_used[device],
                no_split_module_classes,
                tied_parameters,
            )
            # use the next iteration to put the fallback module on the next device to avoid code duplication
            if fallback_module is not None:
                modules_to_treat = [(fallback_module_name, fallback_module)] + [(name, module)] + remaining_modules
                continue

        # Nothing could be placed on this device at all: remember how much memory the attempt would have required.
        if device_memory_used[device] == 0:
            device_minimum_assignment_memory[device] = module_size_with_ties + current_memory_reserved

        #  Neither the current module nor any tied modules can be split, so we move to the next device.
        device_memory_used[device] = device_memory_used[device] + current_memory_reserved
        current_device += 1
        # The current module is put back at the front so it is the first one tried on the next device.
        modules_to_treat = [(name, module)] + modules_to_treat

    # Only keep the devices on which something was accounted for.
    device_memory_used = {device: mem for device, mem in device_memory_used.items() if mem > 0}

    if clean_result:
        device_map = clean_device_map(device_map)

    # Buffers of the modules placed on the CPU or the disk: warn when no non-CPU/disk device listed in `max_memory`
    # has enough memory left to hold them on top of what it already uses.
    non_gpu_buffer_size = device_buffer_sizes.get("cpu", 0) + device_buffer_sizes.get("disk", 0)
    if non_gpu_buffer_size > 0 and not offload_buffers:
        is_buffer_fit_any_gpu = False
        for gpu_device, gpu_max_memory in max_memory.items():
            if gpu_device == "cpu" or gpu_device == "disk":
                continue

            if not is_buffer_fit_any_gpu:
                gpu_memory_used = device_memory_used.get(gpu_device, 0)

                if gpu_max_memory >= non_gpu_buffer_size + gpu_memory_used:
                    is_buffer_fit_any_gpu = True

        if len(gpus) > 0 and not is_buffer_fit_any_gpu:
            warnings.warn(
                f"Current model requires {non_gpu_buffer_size} bytes of buffer for offloaded layers, which seems does "
                f"not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using "
                f"offload_buffers=True."
            )

    # Report the devices that ended up empty together with the memory the failed attempt would have needed.
    if device_minimum_assignment_memory:
        devices_info = "\n".join(
            f"  - {device}: {mem} bytes required" for device, mem in device_minimum_assignment_memory.items()
        )
        logger.info(
            f"Based on the current allocation process, no modules could be assigned to the following devices due to "
            f"insufficient memory:\n"
            f"{devices_info}\n"
            f"These minimum requirements are specific to this allocation attempt and may vary. Consider increasing "
            f"the available memory for these devices to at least the specified minimum, or adjusting the model config."
        )
    return device_map


def check_device_map(model: nn.Module, device_map: dict[str, Union[int, str, torch.device]]):
    """
    Checks a device map covers everything in a given model.

    A `UserWarning` is emitted for the keys of `device_map` that name nothing in the model. Keys naming a submodule, a
    parameter or a buffer are all accepted: parameter and buffer names are legitimate keys since they cover the
    matching tensor in the check below.

    Args:
        model (`torch.nn.Module`): The model to check the device map against.
        device_map (`Dict[str, Union[int, str, torch.device]]`): The device map to check.

    Raises:
        `ValueError`: If some entries of `model.state_dict()` are covered by no key of `device_map`. The empty key
        `""` covers the whole model.
    """
    all_model_tensors = list(model.state_dict())

    # Every name a device map key may legitimately refer to: submodules, but also individual parameters and buffers.
    known_names = set(dict(model.named_modules()))
    known_names.update(all_model_tensors)
    known_names.update(name for name, _ in model.named_parameters())
    known_names.update(name for name, _ in model.named_buffers())
    invalid_keys = [k for k in device_map if k != "" and k not in known_names]

    if invalid_keys:
        warnings.warn(
            f"The following device_map keys do not match any submodules in the model: {invalid_keys}", UserWarning
        )

    for module_name in device_map.keys():
        if module_name == "":
            # The root key covers every tensor of the model.
            all_model_tensors.clear()
            break
        else:
            # Drop the tensor named exactly like the key as well as everything nested under it.
            all_model_tensors = [
                name
                for name in all_model_tensors
                if not name == module_name and not name.startswith(module_name + ".")
            ]
    if len(all_model_tensors) > 0:
        non_covered_params = ", ".join(all_model_tensors)
        raise ValueError(
            f"The device_map provided does not give any device for the following parameters: {non_covered_params}"
        )


def _safetensors_target_device(device):
    """
    Map an integer device index to `npu:<index>` (when an NPU is available) or `hpu` (when an HPU is available). Any
    other value, or an integer when neither is available, is returned unchanged.
    """
    if isinstance(device, int):
        if is_npu_available():
            return f"npu:{device}"
        if is_hpu_available():
            return "hpu"
    return device


def load_state_dict(checkpoint_file, device_map=None):
    """
    Load a checkpoint from a given file. If the checkpoint is in the safetensors format and a device map is passed, the
    weights can be fast-loaded directly on the GPU.

    Args:
        checkpoint_file (`str`): The path to the checkpoint to load.
        device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*):
            A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
            name, once a given module name is inside, every submodule of it will be sent to the same device.

    Returns:
        The loaded state dict, mapping weight names to tensors. Files not ending with `.safetensors` are loaded on
        the CPU with `torch.load(..., weights_only=True)`.

    Raises:
        `OSError`: If the safetensors metadata has a `format` that is none of `pt`, `tf` or `flax`.
        `ValueError`: If the safetensors archive was saved with a format other than `pt`.
    """
    if not checkpoint_file.endswith(".safetensors"):
        return torch.load(checkpoint_file, map_location=torch.device("cpu"), weights_only=True)

    with safe_open(checkpoint_file, framework="pt") as f:
        metadata = f.metadata()
        weight_names = f.keys()

    if metadata is None:
        logger.warning(
            f"The safetensors archive passed at {checkpoint_file} does not contain metadata. "
            "Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata."
        )
        metadata = {"format": "pt"}

    if metadata.get("format") not in ["pt", "tf", "flax"]:
        raise OSError(
            f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure "
            "you save your model with the `save_pretrained` method."
        )
    elif metadata["format"] != "pt":
        raise ValueError(f"The checkpoint passed was saved with {metadata['format']}, we need a the pt format.")

    if device_map is None:
        return safe_load_file(checkpoint_file)

    # if we only have one device we can load everything directly
    if len(set(device_map.values())) == 1:
        only_device = next(iter(device_map.values()))
        return safe_load_file(checkpoint_file, device=_safetensors_target_device(only_device))

    devices = list(set(device_map.values()) - {"disk"})
    # cpu device should always exist as fallback option
    if "cpu" not in devices:
        devices.append("cpu")

    # For each device, get the weights that go there
    device_weights = {device: [] for device in devices}
    for module_name, device in device_map.items():
        if device in devices:
            device_weights[device].extend(
                [k for k in weight_names if k == module_name or k.startswith(module_name + ".")]
            )

    # all weights that haven't defined a device should be loaded on CPU. The set of already assigned names is built
    # once, instead of concatenating every per-device list again for each weight name.
    assigned_weights = set()
    for names in device_weights.values():
        assigned_weights.update(names)
    device_weights["cpu"].extend([k for k in weight_names if k not in assigned_weights])

    if is_tqdm_available():
        progress_bar = tqdm(
            main_process_only=False,
            total=sum(len(device_weights[device]) for device in devices),
            unit="w",
            smoothing=0,
            leave=False,
        )
    else:
        progress_bar = None

    tensors = {}
    try:
        for device in devices:
            # The file is re-opened once per device so that its tensors are materialized directly there.
            with safe_open(checkpoint_file, framework="pt", device=_safetensors_target_device(device)) as f:
                for key in device_weights[device]:
                    if progress_bar is not None:
                        progress_bar.set_postfix(dev=device, refresh=False)
                        progress_bar.set_description(key)
                    tensors[key] = f.get_tensor(key)
                    if progress_bar is not None:
                        progress_bar.update()
    finally:
        # Close the progress bar even when reading a tensor fails.
        if progress_bar is not None:
            progress_bar.close()

    return tensors


def get_state_dict_offloaded_model(model: nn.Module):
    """
    Build the state dictionary of an offloaded model by onloading its submodules one after the other.

    Every named submodule (the root module, whose name is empty, is skipped) is aligned on the CPU while its state
    dict is read. Tensors found on the meta device are left out; a warning lists those that no other submodule
    provided.

    Args:
        model (`torch.nn.Module`):
            The offloaded model we want to save

    Returns:
        `dict`: A mapping from `<submodule name>.<key>` to the corresponding tensor, moved to the CPU.

    Raises:
        `MemoryError`: If aligning a submodule on the CPU or reading its state dict raises a `MemoryError`.
    """
    gathered = {}
    still_on_meta = set()
    meta_device = torch.device("meta")

    for prefix, submodule in model.named_modules():
        if prefix == "":
            continue

        try:
            with align_module_device(submodule, "cpu"):
                submodule_state = submodule.state_dict()
        except MemoryError:
            raise MemoryError("Offloaded module must fit in CPU memory to call save_model!") from None

        for key, tensor in submodule_state.items():
            full_key = f"{prefix}.{key}"
            if tensor.device == meta_device:
                # placeholder still on the meta device: remember it, it may be provided by another submodule
                still_on_meta.add(full_key)
            else:
                gathered[full_key] = tensor.to("cpu")  # move buffers to cpu

    # A placeholder is only a problem when no submodule provided an actual tensor for it.
    still_on_meta.difference_update(gathered)
    if still_on_meta:
        logger.warning(f"The following tensors were not saved because they were still on meta device: {still_on_meta}")

    return gathered


def get_state_dict_from_offload(
    module: nn.Module,
    module_name: str,
    state_dict: dict[str, Union[str, torch.Tensor]],
    device_to_put_offload: Union[int, str, torch.device] = "cpu",
):
    """
    Retrieve the state dictionary (with parameters) from an offloaded module and load into a specified device (defaults
    to cpu).

    Args:
        module: (`torch.nn.Module`):
            The module we want to retrieve a state dictionary from
        module_name: (`str`):
            The name of a parameter of the module of interest (e.g. `layer.weight`). Everything after the last dot
            is dropped to obtain the prefix under which the module's tensors are stored in `state_dict`.
        state_dict (`Dict[str, Union[str, torch.Tensor]]`):
            Dictionary of {parameter names: parameters}
        device_to_put_offload (`Union[int, str, torch.device]`):
            Device to load offloaded parameters into, defaults to the cpu.

    Returns:
        The `state_dict` received as argument, updated in place: every key `<prefix>.<key>` that it already contains
        and that matches a key of `module.state_dict()` is replaced by the module's tensor.
    """
    # Prefix of the module's entries: the name without the trailing `.weight` / `.bias`, separator included. With
    # `rpartition`, a name without any dot gives an empty prefix, whereas slicing with `rfind` (which returns -1)
    # would silently drop the last character of the name.
    root, separator, _ = module_name.rpartition(".")
    prefix = root + separator

    # do not move parameters if the module is not offloaded
    if not has_offloaded_params(module):
        device_to_put_offload = None

    # assign the device to which the offloaded parameters will be sent
    with align_module_device(module, device_to_put_offload):
        for m_key, params in module.state_dict().items():
            full_key = prefix + m_key
            # only entries that are already present are refreshed, no new key is added
            if full_key in state_dict:
                state_dict[full_key] = params

    return state_dict


def load_checkpoint_in_model(
    model: nn.Module,
    checkpoint: Union[str, os.PathLike],
    device_map: Optional[dict[str, Union[int, str, torch.device]]] = None,
    offload_folder: Optional[Union[str, os.PathLike]] = None,
    dtype: Optional[Union[str, torch.dtype]] = None,
    offload_state_dict: bool = False,
    offload_buffers: bool = False,
    keep_in_fp32_modules: Optional[list[str]] = None,
    offload_8bit_bnb: bool = False,
    strict: bool = False,
    full_state_dict: bool = True,
    broadcast_from_rank0: bool = False,
):
    """
    Loads a (potentially sharded) checkpoint inside a model, potentially sending weights to a given device as they are
    loaded.

    <Tip warning={true}>

    Once loaded across devices, you still need to call [`dispatch_model`] on your model to make it able to run. To
    group the checkpoint loading and dispatch in one single call, use [`load_checkpoint_and_dispatch`].

    </Tip>

    Args:
        model (`torch.nn.Module`):
            The model in which we want to load a checkpoint.
        checkpoint (`str` or `os.PathLike`):
            The folder checkpoint to load. It can be:
            - a path to a file containing a whole model state dict
            - a path to a `.json` file containing the index to a sharded checkpoint
            - a path to a folder containing a unique `.index.json` file and the shards of a checkpoint.
            - a path to a folder containing a unique pytorch_model.bin or a model.safetensors file.
        device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*):
            A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
            name, once a given module name is inside, every submodule of it will be sent to the same device.
        offload_folder (`str` or `os.PathLike`, *optional*):
            If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
        dtype (`str` or `torch.dtype`, *optional*):
            If provided, the weights will be converted to that type when loaded.
        offload_state_dict (`bool`, *optional*, defaults to `False`):
            If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
            the weight of the CPU state dict + the biggest shard does not fit.
        offload_buffers (`bool`, *optional*, defaults to `False`):
            Whether or not to include the buffers in the weights offloaded to disk.
        keep_in_fp32_modules(`List[str]`, *optional*):
            A list of the modules that we keep in `torch.float32` dtype.
        offload_8bit_bnb (`bool`, *optional*):
            Whether or not to enable offload of 8-bit modules on cpu/disk.
        strict (`bool`, *optional*, defaults to `False`):
            Whether to strictly enforce that the keys in the checkpoint state_dict match the keys of the model's
            state_dict.
        full_state_dict (`bool`, *optional*, defaults to `True`): if this is set to `True`, all the tensors in the
            loaded state_dict will be gathered. No ShardedTensor and DTensor will be in the loaded state_dict.
        broadcast_from_rank0 (`bool`, *optional*, defaults to `False`): when the option is `True`, a distributed
            `ProcessGroup` must be initialized. rank0 should receive a full state_dict and will broadcast the tensors
            in the state_dict one by one to other ranks. Other ranks will receive the tensors and shard (if applicable)
            according to the local shards in the model.

    Raises:
        ValueError: If `device_map` sends a module to `"disk"` without an `offload_folder`, if `checkpoint` cannot be
            resolved to checkpoint file(s), or if a loaded parameter has no device in `device_map`.
    """
    if offload_8bit_bnb:
        from .bnb import quantize_and_offload_8bit

    tied_params = find_tied_parameters(model)

    if check_tied_parameters_in_config(model) and len(tied_params) == 0:
        logger.warning(
            "The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function."
        )
    if device_map is not None:
        check_tied_parameters_on_same_device(tied_params, device_map)

    if offload_folder is None and device_map is not None and "disk" in device_map.values():
        raise ValueError(
            "At least one of the model submodule will be offloaded to disk, please pass along an `offload_folder`."
        )
    elif offload_folder is not None and device_map is not None and "disk" in device_map.values():
        os.makedirs(offload_folder, exist_ok=True)

    if isinstance(dtype, str):
        # We accept "torch.float16" or just "float16"
        dtype = dtype.replace("torch.", "")
        dtype = getattr(torch, dtype)

    # Resolve `checkpoint` into either a list of weight files or the index file of a sharded checkpoint.
    checkpoint_files = None
    index_filename = None
    if os.path.isfile(checkpoint):
        if str(checkpoint).endswith(".json"):
            index_filename = checkpoint
        else:
            checkpoint_files = [checkpoint]
    elif os.path.isdir(checkpoint):
        # List the folder once; every check below works on this snapshot.
        folder_content = os.listdir(checkpoint)
        # check if the whole state dict is present
        if WEIGHTS_NAME in folder_content:
            checkpoint_files = [os.path.join(checkpoint, WEIGHTS_NAME)]
        elif SAFE_WEIGHTS_NAME in folder_content:
            checkpoint_files = [os.path.join(checkpoint, SAFE_WEIGHTS_NAME)]
        else:
            # otherwise check for sharded checkpoints
            potential_index = [f for f in folder_content if f.endswith(".index.json")]
            if len(potential_index) == 0:
                raise ValueError(
                    f"{checkpoint} is not a folder containing a `.index.json` file or a {WEIGHTS_NAME} or a {SAFE_WEIGHTS_NAME} file"
                )
            elif len(potential_index) == 1:
                index_filename = os.path.join(checkpoint, potential_index[0])
            else:
                raise ValueError(
                    f"{checkpoint} containing more than one `.index.json` file, delete the irrelevant ones."
                )
    else:
        raise ValueError(
            "`checkpoint` should be the path to a file containing a whole state dict, or the index of a sharded "
            f"checkpoint, or a folder containing a sharded checkpoint or the whole state dict, but got {checkpoint}."
        )

    if index_filename is not None:
        checkpoint_folder = os.path.split(index_filename)[0]
        with open(index_filename) as f:
            index = json.load(f)

        if "weight_map" in index:
            index = index["weight_map"]
        # Each shard is loaded once, in a deterministic (sorted) order.
        checkpoint_files = sorted(set(index.values()))
        checkpoint_files = [os.path.join(checkpoint_folder, f) for f in checkpoint_files]

    # Logic for missing/unexpected keys goes here.

    offload_index = {}
    if offload_state_dict:
        state_dict_folder = tempfile.mkdtemp()
        state_dict_index = {}

    unexpected_keys = set()
    model_keys = set(model.state_dict().keys())
    buffer_names = [name for name, _ in model.named_buffers()]
    model_devices = {t.device for t in model.state_dict().values() if isinstance(t, torch.Tensor)}
    model_physical_devices = model_devices - {torch.device("meta")}
    for checkpoint_file in checkpoint_files:
        if device_map is None:
            # exception for multi-device loading was made for the meta device in torch v2.7.0
            # https://github.com/pytorch/pytorch/blob/v2.6.0/torch/distributed/checkpoint/state_dict.py#L557-L563
            # https://github.com/pytorch/pytorch/blob/v2.7.0-rc2/torch/distributed/checkpoint/state_dict.py#L575-L587
            if is_torch_version(">=", "2.2.0") and (
                (is_torch_version(">=", "2.7.0") and len(model_physical_devices) <= 1) or len(model_devices) <= 1
            ):
                from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict

                broadcast_from_rank0 &= is_torch_version(">=", "2.4.0")
                loaded_checkpoint = (
                    load_state_dict(checkpoint_file, device_map=device_map)
                    if not broadcast_from_rank0 or dist.get_rank() == 0
                    else {}
                )
                set_model_state_dict(
                    model,
                    loaded_checkpoint,
                    options=StateDictOptions(
                        full_state_dict=full_state_dict,
                        strict=strict,
                        **({"broadcast_from_rank0": broadcast_from_rank0} if is_torch_version(">=", "2.4.0") else {}),
                    ),
                )
            else:
                loaded_checkpoint = load_state_dict(checkpoint_file, device_map=device_map)
                model.load_state_dict(loaded_checkpoint, strict=strict)

            unexpected_keys.update(set(loaded_checkpoint.keys()) - model_keys)
        else:
            loaded_checkpoint = load_state_dict(checkpoint_file, device_map=device_map)

            for param_name, param in loaded_checkpoint.items():
                # skip SCB parameter (for 8-bit serialization)
                if "SCB" in param_name:
                    continue

                if param_name not in model_keys:
                    unexpected_keys.add(param_name)
                    if not strict:
                        continue  # Skip loading this parameter.

                module_name = param_name

                # Walk up the module hierarchy until a prefix of the parameter name is found in the device map.
                while len(module_name) > 0 and module_name not in device_map:
                    module_name = ".".join(module_name.split(".")[:-1])
                if module_name == "" and "" not in device_map:
                    # TODO: group all errors and raise at the end.
                    raise ValueError(f"{param_name} doesn't have any device set.")
                param_device = device_map[module_name]
                new_dtype = dtype
                if dtype is not None and torch.is_floating_point(param):
                    if keep_in_fp32_modules is not None and dtype == torch.float16:
                        proceed = False
                        for key in keep_in_fp32_modules:
                            if ((key in param_name) and (key + "." in param_name)) or key == param_name:
                                proceed = True
                                break
                        if proceed:
                            new_dtype = torch.float32

                # Always reset the statistics for the current parameter: they must never be left unbound or leak
                # from a previous iteration when an `SCB` entry exists but the weight itself is not int8.
                fp16_statistics = None
                if "weight" in param_name and param.dtype == torch.int8:
                    scb_name = param_name.replace("weight", "SCB")
                    if scb_name in loaded_checkpoint:
                        fp16_statistics = loaded_checkpoint[scb_name]

                if param_device == "disk":
                    if offload_buffers or param_name not in buffer_names:
                        if new_dtype is None:
                            new_dtype = param.dtype
                        if offload_8bit_bnb:
                            quantize_and_offload_8bit(
                                model, param, param_name, new_dtype, offload_folder, offload_index, fp16_statistics
                            )
                            continue
                        else:
                            set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype)
                        offload_weight(param, param_name, offload_folder, index=offload_index)
                elif param_device == "cpu" and offload_state_dict:
                    if new_dtype is None:
                        new_dtype = param.dtype
                    if offload_8bit_bnb:
                        quantize_and_offload_8bit(
                            model, param, param_name, new_dtype, state_dict_folder, state_dict_index, fp16_statistics
                        )
                    else:
                        set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype)
                        offload_weight(param, param_name, state_dict_folder, index=state_dict_index)
                else:
                    set_module_tensor_to_device(
                        model,
                        param_name,
                        param_device,
                        value=param,
                        dtype=new_dtype,
                        fp16_statistics=fp16_statistics,
                    )

        # Force Python to clean up.
        del loaded_checkpoint
        gc.collect()

    if not strict and len(unexpected_keys) > 0:
        logger.warning(
            f"Some weights of the model checkpoint at {checkpoint} were not used when"
            f" initializing {model.__class__.__name__}: {unexpected_keys}. This may or may not be an issue - make sure that the checkpoint does not have unnecessary parameters, or that the model definition correctly corresponds to the checkpoint."
        )

    save_offload_index(offload_index, offload_folder)

    # Load back offloaded state dict on CPU
    if offload_state_dict:
        load_offloaded_weights(model, state_dict_index, state_dict_folder)
        shutil.rmtree(state_dict_folder)

    retie_parameters(model, tied_params)


def get_mixed_precision_context_manager(native_amp: bool = False, autocast_kwargs: AutocastKwargs = None):
    """
    Build the context manager under which mixed precision computations should run.

    Args:
        native_amp (`bool`, *optional*, defaults to False):
            Whether mixed precision is actually enabled. When `False`, a no-op `contextlib.nullcontext()` is
            returned.
        autocast_kwargs (`AutocastKwargs`, *optional*):
            Extra options for autocasting. They are converted with `to_kwargs()` and forwarded to `torch.autocast`.

    Returns:
        A `torch.autocast` context manager configured from the current `AcceleratorState`, or a
        `contextlib.nullcontext()` when `native_amp` is `False`.
    """
    state = AcceleratorState()
    autocast_kwargs = {} if autocast_kwargs is None else autocast_kwargs.to_kwargs()

    if not native_amp:
        return contextlib.nullcontext()

    # XLA running on GPU autocasts as "cuda"; every other setup uses the type of the state's device.
    if state.distributed_type == DistributedType.XLA and is_torch_xla_available(check_is_gpu=True):
        device_type = "cuda"
    else:
        device_type = state.device.type

    if state.mixed_precision == "fp16":
        return torch.autocast(device_type=device_type, dtype=torch.float16, **autocast_kwargs)

    # bf16 and fp8 both autocast to bfloat16, but only for the distributed setups listed here.
    if state.mixed_precision in ["bf16", "fp8"] and state.distributed_type in [
        DistributedType.NO,
        DistributedType.MULTI_CPU,
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_MLU,
        DistributedType.MULTI_SDAA,
        DistributedType.MULTI_MUSA,
        DistributedType.MULTI_NPU,
        DistributedType.MULTI_XPU,
        DistributedType.MULTI_HPU,
        DistributedType.MULTI_NEURON,
        DistributedType.FSDP,
        DistributedType.XLA,
    ]:
        return torch.autocast(device_type=device_type, dtype=torch.bfloat16, **autocast_kwargs)

    # Anything else: let `torch.autocast` pick its default dtype for the device.
    return torch.autocast(device_type=device_type, **autocast_kwargs)


def get_grad_scaler(distributed_type: DistributedType = None, **kwargs):
    """
    Instantiate the `GradScaler` implementation matching the current environment.

    Args:
        distributed_type (`DistributedType`, *optional*, defaults to None):
            The type of distributed environment.
        kwargs:
            Additional arguments for the utilized `GradScaler` constructor.

    Returns:
        The scaler instance built with `kwargs`.

    Raises:
        ValueError: If MPS is the detected backend and the installed PyTorch is older than 2.8.0.
    """
    # The FSDP check comes first and wins over every backend probe below.
    if distributed_type == DistributedType.FSDP:
        from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler

        return ShardedGradScaler(**kwargs)

    # Backends are probed in a fixed order; the first available one provides the scaler.
    if is_torch_xla_available(check_is_gpu=True):
        import torch_xla.amp as xamp

        return xamp.GradScaler(**kwargs)
    if is_mlu_available():
        return torch.mlu.amp.GradScaler(**kwargs)
    if is_sdaa_available():
        return torch.sdaa.amp.GradScaler(**kwargs)
    if is_musa_available():
        return torch.musa.amp.GradScaler(**kwargs)
    if is_npu_available():
        return torch.npu.amp.GradScaler(**kwargs)
    if is_hpu_available():
        return torch.amp.GradScaler("hpu", **kwargs)
    if is_xpu_available():
        return torch.amp.GradScaler("xpu", **kwargs)
    if is_mps_available():
        if is_torch_version(">=", "2.8.0"):
            return torch.amp.GradScaler("mps", **kwargs)
        raise ValueError("Grad Scaler with MPS device requires a Pytorch >= 2.8.0")

    # No specialised backend detected: fall back to the CUDA scaler, using the API available for this torch version.
    if is_torch_version(">=", "2.3"):
        return torch.amp.GradScaler("cuda", **kwargs)
    return torch.cuda.amp.GradScaler(**kwargs)


def has_offloaded_params(module: torch.nn.Module) -> bool:
    """
    Tell whether `module` carries an `AlignDevicesHook` (stored as `module._hf_hook`) that has offloading enabled.

    Args:
        module (`torch.nn.Module`): The module to check for an offload hook.

    Returns:
        bool: `True` if the module has an offload hook and offloading is enabled, `False` otherwise.
    """
    from ..hooks import AlignDevicesHook  # avoid circular import

    # A missing hook and a hook of another type are both treated as "not offloaded".
    hook = getattr(module, "_hf_hook", None)
    return isinstance(hook, AlignDevicesHook) and hook.offload


@contextlib.contextmanager
def align_module_device(module: torch.nn.Module, execution_device: Optional[torch.device] = None):
    """
    Context manager that temporarily places a module's parameters on an execution device.

    For a module with offloaded parameters, the attached hook's `pre_forward` / `post_forward` are run around the
    context body. For any other module, its direct (non-recursive) parameters are moved to `execution_device` and
    moved back to their initial devices on exit.

    Args:
        module (`torch.nn.Module`):
            Module with parameters to align.
        execution_device (`torch.device`, *optional*):
            If provided, overrides the module's execution device within the context. Otherwise, use hook execution
            device or pass
    """
    offloaded = has_offloaded_params(module)
    override_device = execution_device is not None

    if not offloaded and not override_device:
        # Nothing to align: plain pass-through context.
        yield
        return

    if offloaded:
        if override_device:
            # Remember the hook's own execution device so it can be restored on exit.
            previous_device = module._hf_hook.execution_device
            module._hf_hook.execution_device = execution_device

        try:
            module._hf_hook.pre_forward(module)
            yield
        finally:
            module._hf_hook.post_forward(module, None)
            if override_device:
                module._hf_hook.execution_device = previous_device
        return

    # Regular module with an explicit target device: move the direct parameters, then put them back.
    initial_devices = {}
    for name, param in module.named_parameters(recurse=False):
        initial_devices[name] = param.device
    try:
        for name in initial_devices:
            set_module_tensor_to_device(module, name, execution_device)
        yield
    finally:
        for name, device in initial_devices.items():
            set_module_tensor_to_device(module, name, device)


================================================
FILE: src/accelerate/utils/offload.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
from collections.abc import Mapping
from typing import Optional, Union

import numpy as np
import torch
from safetensors import safe_open


def offload_weight(weight, weight_name, offload_folder, index=None):
    """
    Write a tensor to `<offload_folder>/<weight_name>.dat` as a NumPy memory-mapped array.

    Args:
        weight (`torch.Tensor`):
            The tensor to store. It is detached and copied to CPU first, so tensors that require grad are accepted.
        weight_name (`str`):
            Name of the tensor; used as the file stem and as the key in `index`.
        offload_folder (`str` or `os.PathLike`):
            Folder in which the `.dat` file is written.
        index (`dict`, *optional*):
            If provided, it is updated in place with `{"dtype": ..., "shape": ...}` for this tensor.

    Returns:
        The `index` that was passed in (`None` when no index was given).
    """
    dtype = None
    # `Tensor.numpy()` refuses tensors that require grad; only the values are stored, so drop the autograd link.
    weight = weight.detach()
    # Check the string instead of the dtype to be compatible with versions of PyTorch that don't have bfloat16.
    if str(weight.dtype) == "torch.bfloat16":
        # Need to reinterpret the underlying data as int16 since NumPy does not handle bfloat16s.
        weight = weight.view(torch.int16)
        dtype = "bfloat16"
    array = weight.cpu().numpy()
    tensor_file = os.path.join(offload_folder, f"{weight_name}.dat")
    if index is not None:
        if dtype is None:
            dtype = str(array.dtype)
        # The shape is recorded before the 0-d promotion below, so scalars are indexed with an empty shape.
        index[weight_name] = {"dtype": dtype, "shape": list(array.shape)}
    if array.ndim == 0:
        # 0-d arrays are stored as a single-element 1-d array.
        array = array[None]
    file_array = np.memmap(tensor_file, dtype=array.dtype, mode="w+", shape=array.shape)
    file_array[:] = array[:]
    file_array.flush()
    return index


def load_offloaded_weight(weight_file, weight_info):
    """
    Read back a tensor stored as a NumPy memory-mapped file.

    Args:
        weight_file (`str` or `os.PathLike`):
            Path to the `.dat` file holding the raw data.
        weight_info (`dict`):
            Entry with the `"shape"` and `"dtype"` of the stored tensor. A dtype of `"bfloat16"` means the data was
            stored as int16 and must be reinterpreted.

    Returns:
        `torch.Tensor`: The tensor built from the file content.
    """
    # NumPy memory-mapped arrays can't have 0 dims, so scalars were saved as 1-d arrays of length one.
    shape = tuple(weight_info["shape"]) or (1,)

    # NumPy does not support bfloat16, so such tensors were saved as int16.
    stored_as_bf16 = weight_info["dtype"] == "bfloat16"
    numpy_dtype = "int16" if stored_as_bf16 else weight_info["dtype"]

    mapped = np.memmap(weight_file, dtype=numpy_dtype, shape=shape, mode="r")
    if len(weight_info["shape"]) == 0:
        # Unwrap the single stored element to get the scalar back.
        mapped = mapped[0]

    tensor = torch.tensor(mapped)
    return tensor.view(torch.bfloat16) if stored_as_bf16 else tensor


def save_offload_index(index, offload_folder):
    """
    Merge `index` into the `index.json` file of `offload_folder`, creating the file if needed.

    Entries already present in the file are kept, except for keys also found in `index`, which are overwritten.
    Nothing is written when `index` is `None` or empty.

    Args:
        index (`dict`, *optional*):
            Mapping from weight name to its information.
        offload_folder (`str` or `os.PathLike`):
            Folder containing (or receiving) the `index.json` file.
    """
    if not index:
        # Nothing to save
        return

    index_path = os.path.join(offload_folder, "index.json")

    merged_index = {}
    if os.path.isfile(index_path):
        with open(index_path, encoding="utf-8") as reader:
            merged_index = json.load(reader)
    merged_index.update(index)

    with open(index_path, "w", encoding="utf-8") as writer:
        json.dump(merged_index, writer, indent=2)


def offload_state_dict(save_dir: Union[str, os.PathLike], state_dict: dict[str, torch.Tensor]):
    """
    Write every tensor of a state dict to a folder and record them in that folder's offload index.

    Args:
        save_dir (`str` or `os.PathLike`):
            The directory in which to offload the state dict. It is created if it does not exist.
        state_dict (`Dict[str, torch.Tensor]`):
            The dictionary of tensors to offload.
    """
    os.makedirs(save_dir, exist_ok=True)

    # One file per tensor; `offload_weight` fills in and hands back the index entry for each of them.
    offload_index = {}
    for tensor_name, tensor in state_dict.items():
        offload_index = offload_weight(tensor, tensor_name, save_dir, index=offload_index)

    # Persist (or update) the index stored next to the weight files.
    save_offload_index(offload_index, save_dir)


class PrefixedDataset(Mapping):
    """
    Will access keys in a given dataset by adding a prefix.

    The mapping only exposes the entries of `dataset` whose key starts with `prefix`, under their key with the
    prefix removed. Iteration, length and item access all use this same un-prefixed key space, so the generic
    `Mapping` helpers (`keys()`, `items()`, `values()`, `dict(...)`) work.

    Args:
        dataset (`Mapping`): Any map with string keys.
        prefix (`str`): A prefix to add when trying to access any element in the underlying dataset.
    """

    def __init__(self, dataset: Mapping, prefix: str):
        self.dataset = dataset
        self.prefix = prefix

    def __getitem__(self, key):
        # Look the entry up under its full (prefixed) name in the wrapped dataset.
        return self.dataset[f"{self.prefix}{key}"]

    def __iter__(self):
        # Yield keys without the prefix so that each of them can be passed back to `__getitem__`.
        prefix_length = len(self.prefix)
        return iter([key[prefix_length:] for key in self.dataset if key.startswith(self.prefix)])

    def __len__(self):
        # Count only the entries visible through this view, matching what `__iter__` yields.
        return sum(1 for key in self.dataset if key.startswith(self.prefix))


class OffloadedWeightsLoader(Mapping):
    """
    A collection that loads weights stored in a given state dict or memory-mapped on disk.

    Args:
        state_dict (`Dict[str, torch.Tensor]`, *optional*):
            A dictionary parameter name to tensor.
        save_folder (`str` or `os.PathLike`, *optional*):
            The directory in which the weights are stored (by `offload_state_dict` for instance).
        index (`Dict`, *optional*):
            A dictionary from weight name to their information (`dtype`/ `shape` or safetensors filename). Will default
            to the index saved in `save_folder`.
        device (*optional*):
            Device on which tensors read from safetensors files are returned. Defaults to `"cpu"` when not set.

    Raises:
        ValueError: If none of `state_dict`, `save_folder` and `index` is provided.
    """

    def __init__(
        self,
        state_dict: Optional[dict[str, torch.Tensor]] = None,
        save_folder: Optional[Union[str, os.PathLike]] = None,
        index: Optional[Mapping] = None,
        device=None,
    ):
        if state_dict is None and save_folder is None and index is None:
            raise ValueError("Need either a `state_dict`, a `save_folder` or an `index` containing offloaded weights.")

        self.state_dict = {} if state_dict is None else state_dict
        self.save_folder = save_folder
        if index is None and save_folder is not None:
            # Read with the same explicit encoding that `save_offload_index` uses to write the file.
            with open(os.path.join(save_folder, "index.json"), encoding="utf-8") as f:
                index = json.load(f)
        self.index = {} if index is None else index
        # Keys of the state dict come first, followed by the index keys not already covered. A set is used for
        # the membership test so that merging stays linear in the number of keys.
        self.all_keys = list(self.state_dict.keys())
        known_keys = set(self.all_keys)
        self.all_keys.extend([key for key in self.index if key not in known_keys])
        self.device = device

    def __getitem__(self, key: str):
        # State dict gets priority
        if key in self.state_dict:
            return self.state_dict[key]
        weight_info = self.index[key]
        if weight_info.get("safetensors_file") is not None:
            device = "cpu" if self.device is None else self.device
            tensor = None
            try:
                with safe_open(weight_info["safetensors_file"], framework="pt", device=device) as f:
                    tensor = f.get_tensor(weight_info.get("weight_name", key))
            except TypeError:
                # if failed to get_tensor on the device, such as bf16 on mps, try to load it on CPU first
                with safe_open(weight_info["safetensors_file"], framework="pt", device="cpu") as f:
                    tensor = f.get_tensor(weight_info.get("weight_name", key))

            if "dtype" in weight_info:
                tensor = tensor.to(getattr(torch, weight_info["dtype"]))

            # Covers the CPU fallback above: make sure the tensor ends up on the requested device.
            if tensor.device != torch.device(device):
                tensor = tensor.to(device)
            return tensor

        # Otherwise the weight was stored as a memory-mapped `.dat` file in `save_folder`.
        weight_file = os.path.join(self.save_folder, f"{key}.dat")
        return load_offloaded_weight(weight_file, weight_info)

    def __iter__(self):
        return iter(self.all_keys)

    def __len__(self):
        return len(self.all_keys)


def extract_submodules_state_dict(state_dict: dict[str, torch.Tensor], submodule_names: list[str]):
    """
    Build the sub state-dict made of the entries belonging to the given submodules.

    Args:
        state_dict (`Dict[str, torch.Tensor]`): The state dict to extract from.
        submodule_names (`List[str]`): The list of submodule names we want to extract.

    Returns:
        `dict`: The entries of `state_dict` whose key is one of the submodule names or lies under one of them.
    """
    extracted = {}
    for module_name in submodule_names:
        # Match `module_name` itself or anything below it (`module_name.xxx`). The trailing dot keeps siblings
        # that merely begin with the same characters out (transformers.h.1 vs transformers.h.10 for instance).
        dotted_name = module_name + "."
        for key, param in state_dict.items():
            if key == module_name or key.startswith(dotted_name):
                extracted[key] = param
    return extracted


================================================
FILE: src/accelerate/utils/operations.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A set of basic tensor ops compatible with tpu, gpu, and multigpu
"""

import pickle
import warnings
from collections.abc import Mapping
from contextlib import contextmanager, nullcontext
from functools import update_wrapper, wraps
from typing import Any

import torch

from ..state import AcceleratorState, PartialState
from .constants import TORCH_DISTRIBUTED_OPERATION_TYPES
from .dataclasses import DistributedType, TensorInformation
from .imports import (
    is_npu_available,
    is_torch_distributed_available,
    is_torch_xla_available,
)
from .versions import is_torch_version


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

if is_torch_distributed_available():
    from torch.distributed import ReduceOp


def is_torch_tensor(tensor):
    """Return `True` if `tensor` is an instance of `torch.Tensor`."""
    return isinstance(tensor, torch.Tensor)


def is_torch_xpu_tensor(tensor):
    """
    Return `True` if `tensor` is a `torch.Tensor` living on an XPU device with one of the supported dtypes.

    The accepted dtypes mirror the typed tensor classes this helper used to name (`FloatTensor`, `ByteTensor`,
    `IntTensor`, `LongTensor`, `HalfTensor`, `DoubleTensor`, `BFloat16Tensor`). The check is done on the tensor's
    device and dtype because `isinstance` only takes two arguments (the classes would have to be given as one
    tuple), and looking at device/dtype does not depend on `torch.xpu.*Tensor` classes being defined.

    Args:
        tensor: Any object.

    Returns:
        `bool`: `False` for non-tensors and for tensors on another device or with another dtype.
    """
    if not isinstance(tensor, torch.Tensor):
        return False
    xpu_dtypes = (
        torch.float32,
        torch.uint8,
        torch.int32,
        torch.int64,
        torch.float16,
        torch.float64,
        torch.bfloat16,
    )
    return tensor.device.type == "xpu" and tensor.dtype in xpu_dtypes


def is_tensor_information(tensor_info):
    """Return `True` if `tensor_info` is an instance of `TensorInformation`."""
    return isinstance(tensor_info, TensorInformation)


def is_namedtuple(data):
    """
    Tell whether `data` looks like a `namedtuple`: a tuple exposing both `_asdict` and `_fields`. False positives
    are possible, but only for objects deliberately mimicking a `namedtuple`.
    """
    if not isinstance(data, tuple):
        return False
    return hasattr(data, "_asdict") and hasattr(data, "_fields")


def honor_type(obj, generator):
    """
    Build an object of the same type as `obj` (list, tuple, or namedtuple) from the items produced by `generator`.
    """
    container_type = type(obj)
    if not is_namedtuple(obj):
        return container_type(generator)
    # A namedtuple takes its fields as separate positional arguments, not as a single iterable.
    return container_type(*generator)


def recursively_apply(func, data, *args, test_type=is_torch_tensor, error_on_other_type=False, **kwargs):
    """
    Apply `func` to every leaf accepted by `test_type` inside a nested list/tuple/dictionary structure.

    Args:
        func (`callable`):
            The function to recursively apply.
        data (nested list/tuple/dictionary):
            The data on which to apply `func`
        *args:
            Positional arguments that will be passed to `func` when applied on the unpacked data.
        test_type (`callable`, *optional*, defaults to `is_torch_tensor`):
            Predicate called on each leaf; `func` is applied only to the leaves for which it returns a truthy value.
        error_on_other_type (`bool`, *optional*, defaults to `False`):
            Whether to raise an error when, after unpacking `data`, a leaf is rejected by `test_type`. If `False`,
            such leaves are returned unchanged.
        **kwargs (additional keyword arguments, *optional*):
            Keyword arguments that will be passed to `func` when applied on the unpacked data.

    Returns:
        The same data structure as `data` with `func` applied to every leaf accepted by `test_type`.

    Raises:
        TypeError: If `error_on_other_type` is `True` and a leaf is rejected by `test_type`.
    """
    # Options handed down unchanged to every nested call.
    forwarded = dict(test_type=test_type, error_on_other_type=error_on_other_type, **kwargs)

    if isinstance(data, (tuple, list)):
        return honor_type(data, (recursively_apply(func, item, *args, **forwarded) for item in data))

    if isinstance(data, Mapping):
        # Rebuild a mapping of the same type, keeping the keys and transforming the values.
        return type(data)({key: recursively_apply(func, value, *args, **forwarded) for key, value in data.items()})

    if test_type(data):
        return func(data, *args, **kwargs)

    if error_on_other_type:
        raise TypeError(
            f"Unsupported types ({type(data)}) passed to `{func.__name__}`. Only nested list/tuple/dicts of "
            f"objects that are valid for `{test_type.__name__}` should be passed."
        )
    return data


def send_to_device(tensor, device, non_blocking=False, skip_keys=None):
    """
    Recursively sends the elements in a nested list/tuple/dictionary of tensors to a given device.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to send to a given device.
        device (`torch.device`):
            The device to send the data to.
        non_blocking (`bool`, *optional*, defaults to `False`):
            Forwarded to `.to()` as the `non_blocking` keyword. If `.to()` raises a `TypeError` (it does not accept
            that keyword), the object is moved with a plain `.to(device)` instead.
        skip_keys (`str` or container of keys, *optional*):
            Keys of mappings whose values are returned as-is instead of being sent to `device`. A single string is
            treated as a one-element list.

    Returns:
        The same data structure as `tensor` with all tensors sent to the proper device.
    """
    # Anything exposing a `to` attribute is treated as movable, not only real tensors.
    if is_torch_tensor(tensor) or hasattr(tensor, "to"):
        # `torch.Tensor.to("npu")` could not find context when called for the first time (see this [issue](https://gitee.com/ascend/pytorch/issues/I8KECW?from=project-issue)).
        if device == "npu":
            device = "npu:0"
        try:
            return tensor.to(device, non_blocking=non_blocking)
        except TypeError:  # .to() doesn't accept non_blocking as kwarg
            return tensor.to(device)
        except AssertionError as error:
            # `torch.Tensor.to(<int num>)` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
            # This call is inside the try-block since is_npu_available is not supported by torch.compile.
            if is_npu_available():
                if isinstance(device, int):
                    device = f"npu:{device}"
            else:
                raise error
        # Only reached after an `AssertionError` on NPU: retry the move (with the rewritten device string when
        # `device` was an integer).
        try:
            return tensor.to(device, non_blocking=non_blocking)
        except TypeError:  # .to() doesn't accept non_blocking as kwarg
            return tensor.to(device)
    elif isinstance(tensor, (tuple, list)):
        # `honor_type` rebuilds a container of the same type as the input from the moved elements.
        return honor_type(
            tensor, (send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys) for t in tensor)
        )
    elif isinstance(tensor, Mapping):
        # Normalize `skip_keys` so the membership test below always works on a container.
        if isinstance(skip_keys, str):
            skip_keys = [skip_keys]
        elif skip_keys is None:
            skip_keys = []
        return type(tensor)(
            {
                k: t if k in skip_keys else send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys)
                for k, t in tensor.items()
            }
        )
    else:
        # Objects that are neither movable nor a supported container are returned unchanged.
        return tensor


def get_data_structure(data):
    """
    Describe every tensor of a nested list/tuple/dictionary by its shape and dtype.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`):
            The data to analyze.

    Returns:
        The same data structure as `data` where each tensor is replaced by a [`~utils.TensorInformation`] holding
        its `shape` and `dtype`.
    """
    return recursively_apply(lambda leaf: TensorInformation(shape=leaf.shape, dtype=leaf.dtype), data)


def get_shape(data):
    """
    Collect the shape of every tensor of a nested list/tuple/dictionary as a plain list.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`):
            The data to analyze.

    Returns:
        The same data structure as `data` where each tensor is replaced by `list(tensor.shape)`.
    """
    return recursively_apply(lambda leaf: list(leaf.shape), data)


def initialize_tensors(data_structure):
    """
    Allocate tensors matching a nested list/tuple/dictionary of [`~utils.TensorInformation`].

    Args:
        data_structure (nested list/tuple/dictionary of [`~utils.TensorInformation`]):
            The description of the tensors to create.

    Returns:
        The same data structure as `data_structure` where each [`~utils.TensorInformation`] is replaced by a tensor
        created with `torch.empty` using the recorded shape and dtype (its content is uninitialized).
    """

    def _allocate(info):
        return torch.empty(*info.shape, dtype=info.dtype)

    return recursively_apply(_allocate, data_structure, test_type=is_tensor_information)


def find_batch_size(data):
    """
    Recursively finds the batch size in a nested list/tuple/dictionary of lists of tensors.

    Only the first element (for a list/tuple) or the first key (for a mapping) is followed at every level.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.

    Returns:
        `int`: The size of the first dimension of the first tensor reached.

    Raises:
        ValueError: If an empty list/tuple/mapping is encountered.
        TypeError: If the object reached is not a `torch.Tensor`.
    """
    is_sequence = isinstance(data, (tuple, list))
    is_mapping = isinstance(data, Mapping)

    if (is_sequence or is_mapping) and len(data) == 0:
        raise ValueError(f"Cannot find the batch size from empty {type(data)}.")

    if is_sequence:
        return find_batch_size(data[0])

    if is_mapping:
        # Return on the very first key: only the first entry is inspected.
        for first_key in data.keys():
            return find_batch_size(data[first_key])
    elif not isinstance(data, torch.Tensor):
        raise TypeError(f"Can only find the batch size of tensors but got {type(data)}.")
    return data.shape[0]


def ignorant_find_batch_size(data):
    """
    Same as [`utils.operations.find_batch_size`], except that a `ValueError` or `TypeError` raised while searching is
    swallowed.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.

    Returns:
        `int` or `None`: The batch size, or `None` when it could not be determined.
    """
    try:
        batch_size = find_batch_size(data)
    except (ValueError, TypeError):
        batch_size = None
    return batch_size


def listify(data):
    """
    Recursively finds tensors in a nested list/tuple/dictionary and converts them to a list of numbers.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to convert to regular numbers.

    Returns:
        The same data structure as `data` with lists of numbers instead of `torch.Tensor`.
    """

    def _as_python_list(tensor):
        host_tensor = tensor.detach().cpu()
        # As of Numpy 1.21.4, NumPy does not support bfloat16 (see
        # https://github.com/numpy/numpy/blob/a47ecdea856986cd60eabbd53265c2ca5916ad5d/doc/source/user/basics.types.rst ).
        # Until Numpy adds bfloat16, bfloat16 tensors are converted to float32 first.
        if host_tensor.dtype == torch.bfloat16:
            host_tensor = host_tensor.to(torch.float32)
        return host_tensor.tolist()

    return recursively_apply(_as_python_list, data)


def _tpu_gather(tensor):
    """
    Gather every tensor of a nested list/tuple/dictionary with `xm.all_gather`, then call `xm.mark_step()`.

    Raises a `TypeError` (through `recursively_apply`) if a leaf of the structure is not a tensor.
    """

    def _tpu_gather_one(tensor):
        # 0-dim tensors are cloned and given a leading dimension before being gathered.
        prepared = tensor.clone()[None] if tensor.ndim == 0 else tensor

        # Can only gather contiguous tensors
        if not prepared.is_contiguous():
            prepared = prepared.contiguous()
        return xm.all_gather(prepared)

    gathered = recursively_apply(_tpu_gather_one, tensor, error_on_other_type=True)
    xm.mark_step()
    return gathered


def _gpu_gather(tensor):
    """
    Gather every tensor of a nested list/tuple/dictionary across processes with `torch.distributed`.

    Uses `all_gather_into_tensor` unless the backend is `None` or `"gloo"`, in which case it falls back to
    `all_gather` followed by a concatenation on dimension 0. Raises a `TypeError` (through `recursively_apply`) if a
    leaf of the structure is not a tensor.
    """
    state = PartialState()
    gather_op = torch.distributed.all_gather_into_tensor

    # NOTE: need to manually synchronize to work around an INT64 collectives bug in oneCCL before torch 2.9.0
    if state.device.type == "xpu" and is_torch_version("<=", "2.8"):
        torch.xpu.synchronize()

    def _gpu_gather_one(tensor):
        # 0-dim tensors are cloned and given a leading dimension before being gathered.
        if tensor.ndim == 0:
            tensor = tensor.clone()[None]

        # Can only gather contiguous tensors
        if not tensor.is_contiguous():
            tensor = tensor.contiguous()

        if state.backend is None or state.backend == "gloo":
            # a backend of `None` is always CPU
            # also gloo does not support `all_gather_into_tensor`,
            # which will result in a larger memory overhead for the op
            per_process = [torch.empty_like(tensor) for _ in range(state.num_processes)]
            torch.distributed.all_gather(per_process, tensor)
            return torch.cat(per_process, dim=0)

        # We use `empty` as `all_gather_into_tensor` slightly
        # differs from `all_gather` for better efficiency,
        # and we rely on the number of items in the tensor
        # rather than its direct shape
        flat_output = torch.empty(
            state.num_processes * tensor.numel(),
            dtype=tensor.dtype,
            device=state.device,
        )
        gather_op(flat_output, tensor)
        trailing_dims = tensor.size()[1:]
        return flat_output.view(-1, *trailing_dims)

    return recursively_apply(_gpu_gather_one, tensor, error_on_other_type=True)


class DistributedOperationException(Exception):
    """
    Error raised when a distributed operation cannot be performed, for instance because the tensors involved do not
    have compatible shapes across processes.
    """


def verify_operation(function):
    """
    Verifies that `tensor` is the same shape across all processes. Only ran if `PartialState().debug` is `True`.

    The data to check is read from the `tensor` keyword argument when present, otherwise from the first positional
    argument of the decorated function.

    Args:
        function (`callable`):
            The distributed operation to guard.

    Returns:
        `callable`: A wrapper that runs the checks (in debug mode and in a distributed setup only) and then calls
        `function` with the unchanged arguments.

    Raises:
        DistributedOperationException: If the tensors are not on the device type configured in `PartialState`, or if
            the shapes gathered from all processes are not all identical.
    """

    @wraps(function)
    def wrapper(*args, **kwargs):
        state = PartialState()
        if state.distributed_type == DistributedType.NO or not state.debug:
            return function(*args, **kwargs)
        operation = f"{function.__module__}.{function.__name__}"
        tensor = kwargs["tensor"] if "tensor" in kwargs else args[0]
        # `tensor` may be a nested structure without a `.device` attribute, so the device reported in the error
        # must be the one found by `find_device`. `find_device` returns `None` when no tensor is present, in which
        # case there is nothing to compare.
        tensor_device = find_device(tensor)
        if tensor_device is not None and state.device.type != tensor_device.type:
            raise DistributedOperationException(
                f"One or more of the tensors passed to {operation} were on the {tensor_device.type} while the `Accelerator` is configured for {state.device.type}. "
                f"Please move it to the {state.device.type} before calling {operation}."
            )
        shapes = get_shape(tensor)
        output = gather_object([shapes])
        if output[0] is not None:
            are_same = output.count(output[0]) == len(output)
            if not are_same:
                process_shape_str = "\n  - ".join([f"Process {i}: {shape}" for i, shape in enumerate(output)])
                raise DistributedOperationException(
                    f"Cannot apply desired operation due to shape mismatches. "
                    "All shapes across devices must be valid."
                    f"\n\nOperation: `{operation}`\nInput shapes:\n  - {process_shape_str}"
                )
        return function(*args, **kwargs)

    return wrapper


def chained_operation(function):
    """
    Decorator that re-raises any `DistributedOperationException` escaping `function` as a new
    `DistributedOperationException` naming the decorated operation, chained to the original error.
    """

    @wraps(function)
    def wrapper(*args, **kwargs):
        try:
            result = function(*args, **kwargs)
        except DistributedOperationException as original_error:
            operation = f"{function.__module__}.{function.__name__}"
            message = f"Error found while calling `{operation}`. Please see the earlier error for more details."
            raise DistributedOperationException(message) from original_error
        return result

    return wrapper


@verify_operation
def gather(tensor):
    """
    Recursively gather tensor in a nested list/tuple/dictionary of tensors from all devices.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather.

    Returns:
        The same data structure as `tensor` with all tensors gathered across processes. Outside of XLA and the
        `torch.distributed` based setups, `tensor` is returned unchanged.
    """
    distributed_type = PartialState().distributed_type
    if distributed_type == DistributedType.XLA:
        return _tpu_gather(tensor)
    if distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
        return _gpu_gather(tensor)
    return tensor


def _gpu_gather_object(object: Any):
    """
    Gather `object` from every process with `torch.distributed.all_gather_object` and flatten the result by one
    level, so `object` must itself be iterable on every process.
    """
    per_process = [None] * PartialState().num_processes
    torch.distributed.all_gather_object(per_process, object)
    # all_gather_object returns a list of lists, so we need to flatten it
    flattened = []
    for contribution in per_process:
        flattened.extend(contribution)
    return flattened


def gather_object(object: Any):
    """
    Recursively gather object in a nested list/tuple/dictionary of objects from all devices.

    Args:
        object (nested list/tuple/dictionary of picklable object):
            The data to gather.

    Returns:
        The objects gathered from every process. Outside of the `torch.distributed` based setups, `object` is
        returned unchanged.

    Raises:
        NotImplementedError: On XLA, where gathering objects is not supported.
    """
    distributed_type = PartialState().distributed_type
    if distributed_type == DistributedType.XLA:
        raise NotImplementedError("gather objects in TPU is not supported")
    if distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
        return _gpu_gather_object(object)
    return object


def _gpu_broadcast(data, src=0):
    """
    Broadcast every tensor of a nested list/tuple/dictionary from process `src` with `torch.distributed.broadcast`.

    The tensors are passed to `torch.distributed.broadcast` directly and the same tensor objects are returned.
    Raises a `TypeError` (through `recursively_apply`) if a leaf of the structure is not a tensor.
    """

    def _gpu_broadcast_one(tensor, src=0):
        torch.distributed.broadcast(tensor, src=src)
        return tensor

    return recursively_apply(
        _gpu_broadcast_one,
        data,
        error_on_other_type=True,
        src=src,
    )


def _tpu_broadcast(tensor, src=0, name="broadcast tensor"):
    """
    Broadcast a nested list/tuple/dictionary from process `src` using `xm.mesh_reduce`.

    Args:
        tensor (nested list/tuple/dictionary):
            The data to broadcast.
        src (`int`, *optional*, defaults to 0):
            Index of the process whose value is kept.
        name (`str`, *optional*, defaults to `"broadcast tensor"`):
            Tag passed to `xm.mesh_reduce`. A suffix (list index or mapping key) is appended for each nested element.

    Returns:
        The same data structure as `tensor` where every leaf is the value selected from process `src`.
    """
    # `src` must be forwarded on every recursive call, otherwise nested elements would always be taken from
    # process 0 regardless of the requested source.
    if isinstance(tensor, (list, tuple)):
        return honor_type(tensor, (_tpu_broadcast(t, src=src, name=f"{name}_{i}") for i, t in enumerate(tensor)))
    elif isinstance(tensor, Mapping):
        return type(tensor)({k: _tpu_broadcast(v, src=src, name=f"{name}_{k}") for k, v in tensor.items()})
    return xm.mesh_reduce(name, tensor, lambda x: x[src])


# Integer code assigned to each supported dtype. Codes start at 1 and follow the order of the tuple below, so new
# dtypes must be appended at the end to keep the existing codes unchanged.
TENSOR_TYPE_TO_INT = {
    dtype: code
    for code, dtype in enumerate(
        (
            torch.float,
            torch.double,
            torch.half,
            torch.bfloat16,
            torch.uint8,
            torch.int8,
            torch.int16,
            torch.int32,
            torch.int64,
            torch.bool,
        ),
        start=1,
    )
}

# Reverse lookup: integer code -> dtype.
TENSOR_INT_TO_DTYPE = dict(zip(TENSOR_TYPE_TO_INT.values(), TENSOR_TYPE_TO_INT.keys()))


def gather_tensor_shape(tensor):
    """
    Grabs the shape of `tensor` only available on one process and returns a tensor of its shape

    Args:
        tensor (`torch.Tensor` or `None`):
            The tensor whose shape and dtype should be shared. Expected to be defined on a single process and to be
            `None` on the others.

    Returns:
        `tuple`: The non-zero shape entries (the result of indexing the reduced buffer with `nonzero()`) and the
        integer dtype code, as defined in `TENSOR_TYPE_TO_INT`.

    Note:
        Entries are recovered by keeping the non-zero values of a sum-reduced buffer, so a dimension of size 0 cannot
        be represented and is dropped.
    """
    # Scratch buffer receiving the shape followed by the dtype code. It MUST be zero-initialized: the buffer is
    # summed across processes and only its non-zero entries are kept, so uninitialized memory (`torch.empty`)
    # would leak garbage into the decoded shape and dtype.
    max_tensor_dimension = 2**20
    state = PartialState()
    base_tensor = torch.zeros(max_tensor_dimension, dtype=torch.int, device=state.device)

    # Since PyTorch can't just send a tensor to another GPU without
    # knowing its size, we store the size of the tensor with data
    # in an allocation
    if tensor is not None:
        shape = tensor.shape
        tensor_dtype = TENSOR_TYPE_TO_INT[tensor.dtype]
        base_tensor[: len(shape) + 1] = torch.tensor(list(shape) + [tensor_dtype], dtype=int)
    # Perform a reduction to copy the size data onto all GPUs
    base_tensor = reduce(base_tensor, reduction="sum")
    base_tensor = base_tensor[base_tensor.nonzero()]
    # The last non-zero data contains the coded dtype the source tensor is
    dtype = int(base_tensor[-1:][0])
    base_tensor = base_tensor[:-1]
    return base_tensor, dtype


def copy_tensor_to_devices(tensor=None) -> torch.Tensor:
    """
    Copies a tensor that only exists on a single device and broadcasts it to other devices. Differs from `broadcast` as
    each worker doesn't need to know its shape when used (and tensor can be `None`)

    Args:
        tensor (`torch.tensor`):
            The tensor that should be sent to all devices. Must only have it be defined on a single device, the rest
            should be `None`.

    Returns:
        `torch.Tensor`: The sum-reduction of the source tensor with zero tensors of the same shape and dtype created
        on the processes where `tensor` is `None`.
    """
    state = PartialState()
    shape, dtype = gather_tensor_shape(tensor)
    if tensor is None:
        # `gather_tensor_shape` returns the shape as a tensor, while `torch.zeros` expects a sequence of ints:
        # flatten it into a plain list so the placeholder gets one dimension per gathered entry.
        size = shape.flatten().tolist()
        tensor = torch.zeros(size, dtype=TENSOR_INT_TO_DTYPE[dtype]).to(state.device)
    return reduce(tensor, reduction="sum")


@verify_operation
def broadcast(tensor, from_process: int = 0):
    """
    Recursively broadcast tensor in a nested list/tuple/dictionary of tensors to all devices.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to broadcast.
        from_process (`int`, *optional*, defaults to 0):
            The process from which to send the data

    Returns:
        The same data structure as `tensor` with all tensors broadcasted to the proper device. Outside of XLA and
        the `torch.distributed` based setups, `tensor` is returned unchanged.
    """
    distributed_type = PartialState().distributed_type
    if distributed_type == DistributedType.XLA:
        return _tpu_broadcast(tensor, src=from_process, name="accelerate.utils.broadcast")
    if distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
        return _gpu_broadcast(tensor, src=from_process)
    return tensor


def broadcast_object_list(object_list, from_process: int = 0):
    """
    Broadcast a list of picklable objects from one process to the others.

    Args:
        object_list (list of picklable objects):
            The list of objects to broadcast. This list will be modified inplace.
        from_process (`int`, *optional*, defaults to 0):
            The process from which to send the data.

    Returns:
        The same list, now containing the objects from process `from_process`.
    """
    distributed_type = PartialState().distributed_type
    if distributed_type == DistributedType.XLA:
        # Each slot is replaced by the value held by `from_process`.
        for position, item in enumerate(object_list):
            object_list[position] = xm.mesh_reduce(
                "accelerate.utils.broadcast_object_list", item, lambda x: x[from_process]
            )
    elif distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
        torch.distributed.broadcast_object_list(object_list, src=from_process)
    return object_list


def slice_tensors(data, tensor_slice, process_index=None, num_processes=None):
    """
    Recursively takes a slice in a nested list/tuple/dictionary of tensors.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`):
            The data to slice.
        tensor_slice (`slice`):
            The slice to take.
        process_index (*optional*):
            Not used by this function.
        num_processes (*optional*):
            Not used by this function.

    Returns:
        The same data structure as `data` with every tensor indexed by `tensor_slice`.
    """
    return recursively_apply(lambda leaf, index: leaf[index], data, tensor_slice)


def concatenate(data, dim=0):
    """
    Recursively concatenate the tensors in a nested list/tuple/dictionary of lists of tensors with the same shape.
    If there is only a single batch of data, it is returned as-is.

    Args:
        data (nested list/tuple/dictionary of lists of tensors `torch.Tensor`):
            The data to concatenate.
        dim (`int`, *optional*, defaults to 0):
            The dimension on which to concatenate.

    Returns:
        The same data structure as `data` with all the tensors concatenated.

    Raises:
        TypeError: If the first element is neither a list/tuple, a mapping nor a tensor, and `data` is not a
            single-element list/tuple.
    """
    # The first element decides how the whole batch of structures is merged.
    first = data[0]

    if isinstance(first, (tuple, list)):
        merged = (concatenate([item[position] for item in data], dim=dim) for position in range(len(first)))
        return honor_type(first, merged)

    if isinstance(first, Mapping):
        return type(first)({key: concatenate([item[key] for item in data], dim=dim) for key in first.keys()})

    if isinstance(first, torch.Tensor):
        return torch.cat(data, dim=dim)

    if isinstance(data, (tuple, list)) and len(data) == 1:
        return first

    raise TypeError(f"Can only concatenate tensors but got {type(first)}")


class CannotPadNestedTensorWarning(UserWarning):
    """Warning category for nested tensors that are left unpadded."""


@chained_operation
def pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False):
    """
    Recursively pad the tensors in a nested list/tuple/dictionary of tensors from all devices to the same size so they
    can safely be gathered.

    Nested tensors are left untouched (a `CannotPadNestedTensorWarning` is emitted), as are tensors for which `dim`
    is out of range.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to pad.
        dim (`int`, *optional*, defaults to 0):
            The dimension on which to pad.
        pad_index (`int`, *optional*, defaults to 0):
            The value with which to pad.
        pad_first (`bool`, *optional*, defaults to `False`):
            Whether to pad at the beginning or the end.

    Returns:
        The same data structure as `tensor` where every tensor has, along `dim`, the largest size found across
        processes.
    """

    def _pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False):
        if getattr(tensor, "is_nested", False):
            warnings.warn(
                "Cannot pad nested tensors without more information. Leaving unprocessed.",
                CannotPadNestedTensorWarning,
            )
            return tensor

        rank = len(tensor.shape)
        if not (-rank <= dim < rank):
            return tensor
        # Convert negative dimensions to non-negative
        if dim < 0:
            dim += rank

        # Gather the shape of this tensor from every process
        local_shape = torch.tensor(tensor.shape, device=tensor.device)[None]
        all_shapes = gather(local_shape).cpu()
        # Then pad to the maximum size
        max_size = max(process_shape[dim] for process_shape in all_shapes)
        if max_size == tensor.shape[dim]:
            return tensor

        old_size = tensor.shape
        new_size = list(old_size)
        new_size[dim] = max_size
        new_tensor = tensor.new_zeros(tuple(new_size)) + pad_index

        # Position of the original data along `dim`: at the end when padding first, at the start otherwise.
        if pad_first:
            data_slice = slice(max_size - old_size[dim], max_size)
        else:
            data_slice = slice(0, old_size[dim])
        indices = tuple(data_slice if axis == dim else slice(None) for axis in range(len(new_size)))
        new_tensor[indices] = tensor
        return new_tensor

    return recursively_apply(
        _pad_across_processes, tensor, error_on_other_type=True, dim=dim, pad_index=pad_index, pad_first=pad_first
    )


def pad_input_tensors(tensor, batch_size, num_processes, dim=0):
    """
    Takes a `tensor` of arbitrary size and pads it so that it can work given `num_processes` needed dimensions.

    The added entries are zeros (the padded tensor is created with `new_zeros`).

    E.g.:
      Tensor: ([3,4,4]) Num processes: 4 Expected result shape: ([4,4,4])

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to pad.
        batch_size (`int`):
            The current batch size.
        num_processes (`int`):
            The number of processes the batch must be shared between.
        dim (`int`, *optional*, defaults to 0):
            The dimension on which to pad.

    Returns:
        The same data structure as `tensor` with every tensor padded along `dim`.
    """

    def _pad_input_tensors(tensor, batch_size, num_processes, dim=0):
        per_process = batch_size // num_processes
        last_inputs = batch_size - (per_process * num_processes)
        if per_process == 0:
            to_pad = num_processes - batch_size
        else:
            to_pad = num_processes - per_process
        # In the rare case that `to_pad` is negative,
        # we need to pad the last inputs - the found `to_pad`
        if last_inputs > to_pad and to_pad < 1:
            to_pad = last_inputs - to_pad
        # NOTE(review): for some sizes (e.g. batch_size=9, num_processes=4) the padded size is not a multiple of
        # `num_processes` - confirm the intended arithmetic against the callers before changing it.
        old_size = tensor.shape
        new_size = list(old_size)
        # Grow the same dimension the data is copied along below (it used to always be dimension 0, which made the
        # copy fail for `dim != 0`).
        new_size[dim] = batch_size + to_pad
        new_tensor = tensor.new_zeros(tuple(new_size))
        indices = tuple(slice(0, old_size[dim]) if i == dim else slice(None) for i in range(len(new_size)))
        new_tensor[indices] = tensor
        return new_tensor

    return recursively_apply(
        _pad_input_tensors,
        tensor,
        error_on_other_type=True,
        batch_size=batch_size,
        num_processes=num_processes,
        dim=dim,
    )


@verify_operation
def reduce(tensor, reduction="mean", scale=1.0):
    """
    Recursively reduce the tensors in a nested list/tuple/dictionary of lists of tensors across all processes by the
    mean of a given operation.

    Every tensor is cloned before being reduced, so the inputs are not modified.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to reduce.
        reduction (`str`, *optional*, defaults to `"mean"`):
            A reduction method. Can be of "mean", "sum", or "none"
        scale (`float`, *optional*):
            A default scaling value to be applied after the reduce, only valid on XLA.

    Returns:
        The same data structure as `data` with all the tensors reduced.
    """

    def _reduce_across_processes(tensor, reduction="mean", scale=1.0):
        state = PartialState()
        reduced = tensor.clone()
        distributed_type = state.distributed_type

        if distributed_type == DistributedType.NO:
            return reduced

        if distributed_type == DistributedType.XLA:
            # Some processes may have different HLO graphs than other
            # processes, for example in the breakpoint API
            # accelerator.set_trigger(). Use mark_step to make HLOs
            # the same on all processes.
            xm.mark_step()
            xm.all_reduce(xm.REDUCE_SUM, [reduced], scale)
            xm.mark_step()
        elif distributed_type.value in TORCH_DISTRIBUTED_OPERATION_TYPES:
            torch.distributed.all_reduce(reduced, ReduceOp.SUM)

        # The collective above is always a sum; the mean is obtained by dividing by the number of processes.
        if reduction == "mean":
            reduced /= state.num_processes
        return reduced

    return recursively_apply(
        _reduce_across_processes, tensor, error_on_other_type=True, reduction=reduction, scale=scale
    )


def convert_to_fp32(tensor):
    """
    Recursively converts the elements nested list/tuple/dictionary of tensors in FP16/BF16 precision to FP32.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to convert from FP16/BF16 to FP32.

    Returns:
        The same data structure as `tensor` with all tensors that were in FP16/BF16 precision converted to FP32.
    """
    half_precision_dtypes = (torch.float16, torch.bfloat16)

    def _is_half_precision(candidate):
        # Besides real tensors, any object exposing a `dtype` attribute is considered.
        if not (is_torch_tensor(candidate) or hasattr(candidate, "dtype")):
            return False
        return candidate.dtype in half_precision_dtypes

    def _upcast(candidate):
        return candidate.float()

    return recursively_apply(_upcast, tensor, test_type=_is_half_precision)


class ConvertOutputsToFp32:
    """
    Decorator to apply to a function outputting tensors (like a model forward pass) that ensures the outputs in FP16
    precision will be convert back to FP32.

    Instances refuse to be pickled: `__getstate__` always raises a `pickle.PicklingError`.

    Args:
        model_forward (`Callable`):
            The function which outputs we want to treat.

    Returns:
        The same function as `model_forward` but with converted outputs.
    """

    def __init__(self, model_forward):
        self.model_forward = model_forward
        # Copy the metadata of the wrapped callable (and set `__wrapped__`) onto this instance.
        update_wrapper(self, model_forward)

    def __call__(self, *args, **kwargs):
        raw_outputs = self.model_forward(*args, **kwargs)
        return convert_to_fp32(raw_outputs)

    def __getstate__(self):
        raise pickle.PicklingError(
            "Cannot pickle a prepared model with automatic mixed precision, please unwrap the model with `Accelerator.unwrap_model(model)` before pickling it."
        )


def convert_outputs_to_fp32(model_forward):
    """
    Wrap `model_forward` so that its FP16/BF16 outputs are converted to FP32.

    Returns a plain function delegating to a `ConvertOutputsToFp32` instance, which is exposed as `__wrapped__`.
    """
    fp32_forward = ConvertOutputsToFp32(model_forward)

    def forward(*args, **kwargs):
        return fp32_forward(*args, **kwargs)

    # To act like a decorator so that it can be popped when doing `extract_model_from_parallel`
    forward.__wrapped__ = fp32_forward
    return forward


def find_device(data):
    """
    Finds the device on which a nested dict/list/tuple of tensors lies (assuming they are all on the same device).

    Args:
        (nested list/tuple/dictionary of `torch.Tensor`): The data we want to know the device of.

    Returns:
        `torch.device` or `None`: The device of the first tensor found, or `None` if no tensor is found.
    """
    if isinstance(data, Mapping):
        children = data.values()
    elif isinstance(data, (tuple, list)):
        children = data
    elif isinstance(data, torch.Tensor):
        return data.device
    else:
        return None

    # Depth-first search: the first tensor encountered decides the answer.
    for child in children:
        found = find_device(child)
        if found is not None:
            return found
    return None


@contextmanager
def GatheredParameters(params, modifier_rank=None, fwd_module=None, enabled=True):
    """
    Wrapper around `deepspeed.runtime.zero.GatheredParameters`, but if Zero-3 is not enabled, will be a no-op context
    manager.

    All arguments are forwarded unchanged to `deepspeed.zero.GatheredParameters` when it is used.
    """
    # We need to use the `AcceleratorState` here since it has access to the deepspeed plugin
    state = AcceleratorState()

    skip_gathering = state.distributed_type != DistributedType.DEEPSPEED
    if not skip_gathering:
        # The plugin is only looked at when DeepSpeed is the active distributed type.
        plugin = state.deepspeed_plugin
        skip_gathering = plugin is not None and not plugin.is_zero3_init_enabled()

    if skip_gathering:
        gather_param_context = nullcontext()
    else:
        import deepspeed

        gather_param_context = deepspeed.zero.GatheredParameters(
            params, modifier_rank=modifier_rank, fwd_module=fwd_module, enabled=enabled
        )

    with gather_param_context:
        yield


================================================
FILE: src/accelerate/utils/other.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import platform
import re
import socket
from codecs import encode
from collections import OrderedDict
from functools import partial, reduce
from types import MethodType
from typing import Optional

import numpy as np
import torch
from packaging.version import Version
from safetensors.torch import save_file as safe_save_file

from ..commands.config.default import write_basic_config  # noqa: F401
from ..logging import get_logger
from ..state import PartialState
from .constants import FSDP_PYTORCH_VERSION
from .dataclasses import DistributedType
from .imports import (
    is_deepspeed_available,
    is_numpy_available,
    is_torch_distributed_available,
    is_torch_xla_available,
    is_weights_only_available,
)
from .modeling import id_tensor_storage
from .transformer_engine import convert_model
from .versions import is_torch_version


# Logger for this module, created through the package's own `get_logger` helper.
logger = get_logger(__name__)


# `xm` is only bound when `is_torch_xla_available()` is true; code using it must run under the same condition.
if is_torch_xla_available():
    import torch_xla.core.xla_model as xm


def is_compiled_module(module: torch.nn.Module) -> bool:
    """
    Tell whether `module` is a `torch._dynamo.eval_frame.OptimizedModule`, i.e. was compiled with `torch.compile()`.

    Always returns `False` when `torch` has no `_dynamo` attribute.
    """
    return hasattr(torch, "_dynamo") and isinstance(module, torch._dynamo.eval_frame.OptimizedModule)


def has_compiled_regions(module: torch.nn.Module) -> bool:
    """
    Tell whether any module yielded by `module.modules()` was compiled with `torch.compile()`.

    Always returns `False` when `torch` has no `_dynamo` attribute or when `module` has no direct submodules.
    """
    if not hasattr(torch, "_dynamo"):
        return False

    if not module._modules:
        return False

    compiled_type = torch._dynamo.eval_frame.OptimizedModule
    return any(isinstance(submodule, compiled_type) for submodule in module.modules())


def is_repeated_blocks(module: torch.nn.Module) -> bool:
    """
    Check whether the module is a repeated block, i.e. a non-empty `torch.nn.ModuleList` whose children are all
    instances of the class of its first child. This is useful to determine whether we should apply regional
    compilation to the module.
    """
    if not isinstance(module, torch.nn.ModuleList):
        return False
    if len(module) == 0:
        return False

    block_class = module[0].__class__
    return all(isinstance(child, block_class) for child in module)


def has_repeated_blocks(module: torch.nn.Module) -> bool:
    """
    Check whether the module has repeated blocks, i.e. `torch.nn.ModuleList` with all children of the same class, at
    any level of the module hierarchy. This is useful to determine whether we should apply regional compilation to the
    module.

    Always returns `False` when `module` has no direct submodules.
    """
    if not module._modules:
        return False
    return any(is_repeated_blocks(submodule) for submodule in module.modules())


def compile_regions(module: torch.nn.Module, **compile_kwargs) -> torch.nn.Module:
    """
    Performs regional compilation where we target repeated blocks of the same class and compile them sequentially to
    hit the compiler's cache. For example, in `GPT2LMHeadModel`, the repeated block/class is `GPT2Block`, and can be
    accessed as `model.transformer.h[0]`. The rest of the model (e.g. model.lm_head) is compiled separately.

    This allows us to speed up the compilation overhead / cold start of models like LLMs and Transformers in general.
    See https://pytorch.org/tutorials/recipes/regional_compilation.html for more details.

    Args:
        module (`torch.nn.Module`):
            The model to compile.
        **compile_kwargs:
            Additional keyword arguments to pass to `torch.compile()`.

    Returns:
        `torch.nn.Module`: A new instance of the model with some compiled regions.

    Example:
    ```python
    >>> from accelerate.utils import compile_regions
    >>> from transformers import AutoModelForCausalLM

    >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
    >>> compiled_model = compile_regions(model, mode="reduce-overhead")
    >>> compiled_model.transformer.h[0]
    OptimizedModule(
        (_orig_mod): GPT2Block(
                (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                (attn): GPT2Attention(
                (c_attn): Conv1D(nf=2304, nx=768)
                (c_proj): Conv1D(nf=768, nx=768)
                (attn_dropout): Dropout(p=0.1, inplace=False)
                (resid_dropout): Dropout(p=0.1, inplace=False)
            )
            (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): GPT2MLP(
                (c_fc): Conv1D(nf=3072, nx=768)
                (c_proj): Conv1D(nf=768, nx=3072)
                (act): NewGELUActivation()
                (dropout): Dropout(p=0.1, inplace=False)
            )
        )
    )
    ```
    """

    def _compile_regions(module: torch.nn.Module, **compile_kwargs) -> torch.nn.Module:
        # Three cases, checked in this order:
        #   1. `module` is itself a list of repeated blocks: compile every block individually.
        #   2. `module` contains repeated blocks deeper down: rebuild it and recurse into its children.
        #   3. otherwise: compile `module` as a whole.
        if is_repeated_blocks(module):
            # The result is a plain `torch.nn.ModuleList` holding the compiled blocks.
            new_module = torch.nn.ModuleList()
            for submodule in module:
                new_module.append(torch.compile(submodule, **compile_kwargs))
        elif has_repeated_blocks(module):
            # Create an instance of the same class without calling `__init__`, copy the instance attributes of the
            # original onto it, then reset `_modules` so the children can be re-registered below.
            new_module = module.__class__.__new__(module.__class__)
            new_module.__dict__.update(module.__dict__)
            new_module._modules = {}
            for name, submodule in module.named_children():
                new_module.add_module(name, _compile_regions(submodule, **compile_kwargs))
        else:
            new_module = torch.compile(module, **compile_kwargs)

        return new_module

    new_module = _compile_regions(module, **compile_kwargs)

    if "_orig_mod" not in new_module.__dict__:
        # Keeps a reference to the original module to decompile/unwrap it later
        new_module.__dict__["_orig_mod"] = module

    return new_module


def compile_regions_deepspeed(module: torch.nn.Module, **compile_kwargs):
    """
    Regional compilation for `DeepSpeedEngine.module`, mirroring what `compile_regions` does.

    Because the model is wrapped in a `DeepSpeedEngine` and carries many added hooks, offloaded parameters, etc. that
    `torch.compile(...)` interferes with, this version of regional compilation relies on the in-place
    `module.compile()` method instead. Nothing is returned; modules are compiled where they are.

    Args:
        module (`torch.nn.Module`):
            The model to compile.
        **compile_kwargs:
            Additional keyword arguments to pass to `module.compile()`.
    """
    if is_repeated_blocks(module):
        # A container of repeated blocks: compile every block on its own
        for block in module:
            block.compile(**compile_kwargs)
        return

    if not has_repeated_blocks(module):
        # Leaf of the traversal: no repeated blocks below, compile the module as a whole
        module.compile(**compile_kwargs)
        return

    # Repeated blocks live somewhere deeper, keep descending
    for child in module.children():
        compile_regions_deepspeed(child, **compile_kwargs)


def model_has_dtensor(model: torch.nn.Module) -> bool:
    """
    Tell whether at least one parameter of `model` is a `DTensor`.

    Args:
        model (`torch.nn.Module`):
            The model to check.

    Returns:
        `bool`: `True` as soon as one `DTensor` parameter is found, `False` otherwise.
    """
    # The public import path is used from torch 2.5.0 on; older versions go through the private
    # `torch.distributed._tensor` module
    if not is_torch_version(">=", "2.5.0"):
        from torch.distributed._tensor import DTensor
    else:
        from torch.distributed.tensor import DTensor

    for parameter in model.parameters():
        if isinstance(parameter, DTensor):
            return True
    return False


def extract_model_from_parallel(
    model, keep_fp32_wrapper: bool = True, keep_torch_compile: bool = True, recursive: bool = False
):
    """
    Extract a model from its distributed containers.

    Args:
        model (`torch.nn.Module`):
            The model to extract.
        keep_fp32_wrapper (`bool`, *optional*, defaults to `True`):
            Whether to keep the mixed precision wrappers around `model.forward`. When `False`, the `__wrapped__`
            chain of `forward` is peeled back (up to the forward stored in `_original_forward`, if any) and a model
            flagged with `_converted_to_transformer_engine` is converted back.
        keep_torch_compile (`bool`, *optional*, defaults to `True`):
            Whether to keep the compiled wrapper. When `True` and the input was compiled, the compiled wrapper is
            returned with its `_orig_mod` pointing at the extracted model; when `False` the extracted model itself
            is returned.
        recursive (`bool`, *optional*, defaults to `False`):
            Whether to recursively extract all cases of `module.module` from `model` as well as unwrap child sublayers
            recursively, not just the top-level distributed containers.

    Returns:
        `torch.nn.Module`: The extracted model.
    """
    # Container types that expose the wrapped model as `.module`; extended below when available
    options = (torch.nn.parallel.DistributedDataParallel, torch.nn.DataParallel)

    is_compiled = is_compiled_module(model)
    has_compiled = has_compiled_regions(model)

    # Remember the compiled wrapper (if any) so it can be put back at the end
    compiled_model = None
    if is_compiled:
        compiled_model = model
        model = model._orig_mod
    elif has_compiled:
        # Skip if top-level not compiled, subs stay wrapped
        if "_orig_mod" in model.__dict__:
            compiled_model = model
            model = model.__dict__["_orig_mod"]

    if is_deepspeed_available():
        from deepspeed import DeepSpeedEngine

        options += (DeepSpeedEngine,)

    if is_torch_version(">=", FSDP_PYTORCH_VERSION) and is_torch_distributed_available():
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP

        options += (FSDP,)

    # Containers can be nested, so keep unwrapping until a non-container is reached
    while isinstance(model, options):
        model = model.module

    if recursive:
        # This is needed in cases such as using FSDPv2 on XLA
        def _recursive_unwrap(module):
            # Wrapped modules are standardly wrapped as `module`, similar to the cases earlier
            # with DDP, DataParallel, DeepSpeed, and FSDP
            if hasattr(module, "module"):
                unwrapped_module = _recursive_unwrap(module.module)
            else:
                unwrapped_module = module
            # Next unwrap child sublayers recursively
            for name, child in unwrapped_module.named_children():
                setattr(unwrapped_module, name, _recursive_unwrap(child))
            return unwrapped_module

        # Start with top-level
        model = _recursive_unwrap(model)

    if not keep_fp32_wrapper:
        forward = model.forward
        original_forward = model.__dict__.pop("_original_forward", None)
        if original_forward is not None:
            # Walk the `__wrapped__` chain, stopping early once the saved original forward is reached
            while hasattr(forward, "__wrapped__"):
                forward = forward.__wrapped__
                if forward == original_forward:
                    break
            model.forward = MethodType(forward, model)
        if getattr(model, "_converted_to_transformer_engine", False):
            convert_model(model, to_transformer_engine=False)

    if keep_torch_compile and compiled_model is not None:
        # Re-attach the extracted model under the compiled wrapper and hand the wrapper back
        if is_compiled:
            compiled_model._orig_mod = model
            model = compiled_model
        elif has_compiled:
            compiled_model.__dict__["_orig_mod"] = model
            model = compiled_model

    return model


def wait_for_everyone():
    """
    Block until every process has reached this call.

    This simply forwards to `PartialState().wait_for_everyone()`.

    <Tip warning={true}>

    Make sure all processes will reach this instruction otherwise one of your processes will hang forever.

    </Tip>
    """
    state = PartialState()
    state.wait_for_everyone()


def clean_state_dict_for_safetensors(state_dict: dict):
    """
    Remove tensor aliasing from a model state dictionary so that it can be written with `safetensors`.

    Entries whose tensors share storage are reduced to the first name found; the extra names are deleted from
    `state_dict` **in place** and reported in a single warning. String values are ignored when looking for aliases.

    Args:
        state_dict (`dict`):
            The state dictionary from a model

    Returns:
        `dict`: A new dictionary holding the remaining entries, with every tensor made contiguous.
    """
    names_by_storage = collections.defaultdict(list)
    for key, value in state_dict.items():
        # When bnb serialization is used, weights in state dict can be strings
        if isinstance(value, str):
            continue
        names_by_storage[id_tensor_storage(value)].append(key)

    warn_names = set()
    for aliases in names_by_storage.values():
        # Only storages referenced by more than one name are shared
        if len(aliases) <= 1:
            continue
        # When not all duplicates have been cleaned, we still remove those keys but put a clear warning.
        # If the link between tensors was done at runtime then `from_pretrained` will not get
        # the key back leading to random tensor. A proper warning will be shown
        # during reload (if applicable), but since the file is not necessarily compatible with
        # the config, better show a proper warning.
        still_present = [alias for alias in aliases if alias in state_dict]
        duplicates = still_present[1:]
        warn_names.update(duplicates)
        for alias in duplicates:
            del state_dict[alias]

    if warn_names:
        logger.warning(
            f"Removed shared tensor {warn_names} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading",
        )

    return {k: v.contiguous() if isinstance(v, torch.Tensor) else v for k, v in state_dict.items()}


def save(obj, f, save_on_each_node: bool = False, safe_serialization: bool = False):
    """
    Save the data to disk. Use in place of `torch.save()`.

    Args:
        obj:
            The data to save
        f:
            The file (or file-like object) to use to save the data
        save_on_each_node (`bool`, *optional*, defaults to `False`):
            When `True`, the local main process of every node writes the data; when `False`, only the global main
            process does.
        safe_serialization (`bool`, *optional*, defaults to `False`):
            Whether to save `obj` using `safetensors` or the traditional PyTorch way (that uses `pickle`).
    """
    state = PartialState()

    # When TorchXLA is enabled, it's necessary to transfer all data to the CPU before saving.
    # Another issue arises with `id_tensor_storage`, which treats all XLA tensors as identical.
    # If tensors remain on XLA, calling `clean_state_dict_for_safetensors` will result in only
    # one XLA tensor remaining.
    if state.distributed_type == DistributedType.XLA:
        obj = xm._maybe_convert_to_cpu(obj)

    if not safe_serialization:
        save_func = torch.save
    else:
        save_func = partial(safe_save_file, metadata={"format": "pt"})
        # Check if it's a model and remove duplicates
        if isinstance(obj, OrderedDict):
            obj = clean_state_dict_for_safetensors(obj)

    # Pick which process is responsible for writing on this call
    is_writer = state.is_local_main_process if save_on_each_node else state.is_main_process
    if is_writer:
        save_func(obj, f)


# The following are considered "safe" globals to reconstruct various types of objects when using `weights_only=True`
# These should be added and then removed after loading in the file
# Pick numpy's core module under whichever attribute name applies: `np._core` when the "2.0.0" check passes,
# `np.core` otherwise (presumably `is_numpy_available("2.0.0")` is a minimum-version check; confirm in `imports`).
np_core = np._core if is_numpy_available("2.0.0") else np.core
TORCH_SAFE_GLOBALS = [
    # numpy arrays are just numbers, not objects, so we can reconstruct them safely
    np_core.multiarray._reconstruct,
    np.ndarray,
    # The following are needed for the RNG states
    encode,
    np.dtype,
]

# `np.dtypes.UInt32DType` is only registered when the "1.25.0" check passes
# NOTE(review): presumably because `np.dtypes` is missing on older numpy; confirm.
if is_numpy_available("1.25.0"):
    TORCH_SAFE_GLOBALS.append(np.dtypes.UInt32DType)


def load(f, map_location=None, **kwargs):
    """
    Compatible drop-in replacement of `torch.load()` which allows for `weights_only` to be used if `torch` version is
    2.4.0 or higher. Otherwise will ignore the kwarg.

    While loading, the entries of `TORCH_SAFE_GLOBALS` (numpy arrays and friends) are registered as safe globals;
    afterwards the safe-globals list is cleared and whatever was registered beforehand is put back.

    Args:
        f:
            The file (or file-like object) to use to load the data
        map_location:
            a function, `torch.device`, string or a dict specifying how to remap storage locations
        **kwargs:
            Additional keyword arguments to pass to `torch.load()`. `weights_only` defaults to `True` when supported.

    Returns:
        The object returned by `torch.load()`.
    """
    supports_weights_only = is_weights_only_available()
    try:
        if supports_weights_only:
            # Snapshot the current safe globals so they can be restored once loading is done
            previous_safe_globals = torch.serialization.get_safe_globals()
            kwargs.setdefault("weights_only", True)
            torch.serialization.add_safe_globals(TORCH_SAFE_GLOBALS)
        else:
            kwargs.pop("weights_only", None)
        result = torch.load(f, map_location=map_location, **kwargs)
    finally:
        if supports_weights_only:
            torch.serialization.clear_safe_globals()
            if previous_safe_globals:
                torch.serialization.add_safe_globals(previous_safe_globals)
    return result


def get_pretty_name(obj):
    """
    Gets a pretty name from `obj`.

    `__qualname__` is preferred over `__name__`. Objects that expose neither are replaced by their class before
    looking the name up, and `str(...)` is the last resort.
    """
    is_named = hasattr(obj, "__qualname__") or hasattr(obj, "__name__")
    target = obj if is_named else getattr(obj, "__class__", obj)

    for attribute in ("__qualname__", "__name__"):
        if hasattr(target, attribute):
            return getattr(target, attribute)
    return str(target)


def merge_dicts(source, destination):
    """
    Recursively merges two dictionaries.

    Nested dictionaries in `source` are merged key by key into the matching entry of `destination` (created as an
    empty dict when missing); every other value simply overwrites the entry in `destination`.

    Args:
        source (`dict`): The dictionary to merge into `destination`.
        destination (`dict`): The dictionary to merge `source` into. It is modified in place.

    Returns:
        `dict`: `destination`, after the merge.
    """
    for key, value in source.items():
        if not isinstance(value, dict):
            destination[key] = value
            continue
        merge_dicts(value, destination.setdefault(key, {}))

    return destination


def is_port_in_use(port: Optional[int] = None) -> bool:
    """
    Checks if a port is in use on `localhost`. Useful for checking if multiple `accelerate launch` commands have been
    run and need to see if the port is already in use.

    Args:
        port (`int`, *optional*):
            The port to probe. Falls back to 29500 when left as `None`.

    Returns:
        `bool`: `True` when a TCP connection to `localhost` on that port succeeds.
    """
    target_port = 29500 if port is None else port
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        # `connect_ex` reports failures through its return code; 0 means the connection went through
        status = probe.connect_ex(("localhost", target_port))
    return status == 0


def get_free_port() -> int:
    """
    Gets a free port on `localhost`. Useful for automatic port selection when port 0 is specified in distributed
    training scenarios.

    The socket used for the lookup is closed before returning, so the port is only known to have been free at the
    time of the call.

    Returns:
        int: An available port number
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        # Binding to port 0 lets the OS pick any free port
        probe.bind(("", 0))
        address = probe.getsockname()
        return address[1]


def convert_bytes(size):
    """
    Converts `size` from bytes to the largest possible unit.

    The value is divided by 1024 until it drops below 1024 (units go from bytes up to TB); anything still larger is
    reported in PB. The number is rounded to two decimals.
    """
    units = ("bytes", "KB", "MB", "GB", "TB")
    for unit in units:
        if not (size < 1024.0):
            size /= 1024.0
            continue
        return f"{round(size, 2)} {unit}"

    return f"{round(size, 2)} PB"


def check_os_kernel():
    """
    Warns if the kernel version is below the recommended minimum on Linux.

    Nothing happens on non-Linux systems, nor when no `major.minor.patch` version can be found in the kernel release
    string (the check is advisory only, so an unusual release string must not raise).
    """
    # see issue #1929
    info = platform.uname()
    if info.system != "Linux":
        return

    # Take the first `x.y.z` occurrence of the release string (e.g. "5.4.0" out of "5.4.0-150-generic")
    match = re.search(r"\d+\.\d+\.\d+", info.release)
    if match is None:
        # Unrecognized release format: there is nothing reliable to compare against
        return

    version = match.group(0)
    min_version = "5.5.0"
    if Version(version) < Version(min_version):
        msg = (
            f"Detected kernel version {version}, which is below the recommended minimum of {min_version}; this can "
            "cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher."
        )
        logger.warning(msg, main_process_only=True)


def recursive_getattr(obj, attr: str):
    """
    Recursive `getattr`.

    Args:
        obj:
            A class instance holding the attribute.
        attr (`str`):
            The attribute that is to be retrieved, e.g. 'attribute1.attribute2'.

    Returns:
        The object reached after following every dot-separated name of `attr`, starting from `obj`.
    """
    current = obj
    for part in attr.split("."):
        current = getattr(current, part)
    return current


def get_module_children_bottom_up(model: torch.nn.Module, return_fqns: bool = False) -> list[torch.nn.Module]:
    """Traverse the model in bottom-up order and return the children modules in that order.

    Args:
        model (`torch.nn.Module`): the model to get the children of
        return_fqns (`bool`, *optional*, defaults to `False`): when set, each entry of the result is a
            `(name, module)` tuple, where `name` is the dot-separated path from `model` (empty string for `model`
            itself), instead of the bare module.

    Returns:
        `list[torch.nn.Module]`: a list of children modules of `model` in bottom-up order. The last element is the
        `model` itself.
    """
    # Depth-first walk with an explicit stack; names are tracked in every case and dropped at the end if unwanted
    pending = [("", model)]
    visited = []
    while pending:
        prefix, module = pending.pop()
        for child_name, child in module.named_children():
            if isinstance(child, torch.nn.Module):
                qualified_name = prefix + "." + child_name if prefix else child_name
                pending.append((qualified_name, child))
        visited.append((prefix, module))

    # Parents were recorded before their children, so reversing yields the bottom-up order
    visited.reverse()
    if return_fqns:
        return visited
    return [module for _, module in visited]


================================================
FILE: src/accelerate/utils/random.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
from typing import Optional, Union

import numpy as np
import torch

from ..state import AcceleratorState
from .constants import CUDA_DISTRIBUTED_TYPES
from .dataclasses import DistributedType, RNGType
from .imports import (
    is_hpu_available,
    is_mlu_available,
    is_musa_available,
    is_neuron_available,
    is_npu_available,
    is_sdaa_available,
    is_torch_xla_available,
    is_xpu_available,
)


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm


def set_seed(seed: int, device_specific: bool = False, deterministic: bool = False):
    """
    Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`.

    Args:
        seed (`int`):
            The seed to set.
        device_specific (`bool`, *optional*, defaults to `False`):
            Whether to differ the seed on each device slightly, by adding the process index
            (`AcceleratorState().process_index`) to it.
        deterministic (`bool`, *optional*, defaults to `False`):
            Whether to use deterministic algorithms where available. Can slow down training.
    """
    if device_specific:
        seed += AcceleratorState().process_index

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Accelerator backends, probed in this order; only the first available one gets seeded
    accelerator_backends = (
        (is_xpu_available, "xpu"),
        (is_npu_available, "npu"),
        (is_mlu_available, "mlu"),
        (is_sdaa_available, "sdaa"),
        (is_musa_available, "musa"),
        (is_hpu_available, "hpu"),
        (is_neuron_available, "neuron"),
    )
    for backend_is_available, backend_name in accelerator_backends:
        if backend_is_available():
            getattr(torch, backend_name).manual_seed_all(seed)
            break
    else:
        # safe to call this function even if cuda is not available
        torch.cuda.manual_seed_all(seed)

    if is_torch_xla_available():
        xm.set_rng_state(seed)

    if deterministic:
        torch.use_deterministic_algorithms(True)


def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optional[torch.Generator] = None):
    """
    Synchronize the state of one random number generator across processes.

    The current state of the RNG selected by `rng_type` is read, broadcast from process 0 when running in one of the
    distributed setups handled below, and then written back to the same RNG.

    Args:
        rng_type (`RNGType`, *optional*):
            Which RNG to synchronize.
            NOTE(review): with the default `None` (or any value not handled below) `rng_state` is never assigned, so
            the distributed branches would fail on it; callers appear expected to always pass a value. Confirm.
        generator (`torch.Generator`, *optional*):
            The generator to synchronize. Required when `rng_type` is `RNGType.GENERATOR`.
    """
    # Get the proper rng state
    if rng_type == RNGType.TORCH:
        rng_state = torch.get_rng_state()
    elif rng_type == RNGType.CUDA:
        rng_state = torch.cuda.get_rng_state()
    elif rng_type == RNGType.XLA:
        assert is_torch_xla_available(), "Can't synchronize XLA seeds as torch_xla is unavailable."
        # Wrapped in a tensor so it can be broadcast like the other states (unwrapped with `.item()` below)
        rng_state = torch.tensor(xm.get_rng_state())
    elif rng_type == RNGType.NPU:
        assert is_npu_available(), "Can't synchronize NPU seeds on an environment without NPUs."
        rng_state = torch.npu.get_rng_state()
    elif rng_type == RNGType.MLU:
        assert is_mlu_available(), "Can't synchronize MLU seeds on an environment without MLUs."
        rng_state = torch.mlu.get_rng_state()
    elif rng_type == RNGType.SDAA:
        assert is_sdaa_available(), "Can't synchronize SDAA seeds on an environment without SDAAs."
        rng_state = torch.sdaa.get_rng_state()
    elif rng_type == RNGType.MUSA:
        assert is_musa_available(), "Can't synchronize MUSA seeds on an environment without MUSAs."
        rng_state = torch.musa.get_rng_state()
    elif rng_type == RNGType.XPU:
        assert is_xpu_available(), "Can't synchronize XPU seeds on an environment without XPUs."
        rng_state = torch.xpu.get_rng_state()
    elif rng_type == RNGType.HPU:
        assert is_hpu_available(), "Can't synchronize HPU seeds on an environment without HPUs."
        rng_state = torch.hpu.get_rng_state()
    elif rng_type == RNGType.NEURON:
        assert is_neuron_available(), "Can't synchronize Neuron seeds on an environment without Neuron Cores."
        rng_state = torch.neuron.get_rng_state()
    elif rng_type == RNGType.GENERATOR:
        assert generator is not None, "Need a generator to synchronize its seed."
        rng_state = generator.get_state()

    # Broadcast the rng state from device 0 to other devices
    state = AcceleratorState()
    if state.distributed_type == DistributedType.XLA:
        rng_state = rng_state.to(xm.xla_device())
        xm.collective_broadcast([rng_state])
        xm.mark_step()
        rng_state = rng_state.cpu()
    elif (
        state.distributed_type in CUDA_DISTRIBUTED_TYPES
        or state.distributed_type == DistributedType.MULTI_MLU
        or state.distributed_type == DistributedType.MULTI_SDAA
        or state.distributed_type == DistributedType.MULTI_MUSA
        or state.distributed_type == DistributedType.MULTI_NPU
        or state.distributed_type == DistributedType.MULTI_XPU
        or state.distributed_type == DistributedType.MULTI_HPU
        or state.distributed_type == DistributedType.MULTI_NEURON
    ):
        # Move the state to this process's device for the broadcast, then bring it back to CPU
        rng_state = rng_state.to(state.device)
        torch.distributed.broadcast(rng_state, 0)
        rng_state = rng_state.cpu()
    elif state.distributed_type == DistributedType.MULTI_CPU:
        torch.distributed.broadcast(rng_state, 0)

    # Set the broadcast rng state
    if rng_type == RNGType.TORCH:
        torch.set_rng_state(rng_state)
    elif rng_type == RNGType.CUDA:
        torch.cuda.set_rng_state(rng_state)
    elif rng_type == RNGType.NPU:
        torch.npu.set_rng_state(rng_state)
    elif rng_type == RNGType.MLU:
        torch.mlu.set_rng_state(rng_state)
    elif rng_type == RNGType.SDAA:
        torch.sdaa.set_rng_state(rng_state)
    elif rng_type == RNGType.MUSA:
        torch.musa.set_rng_state(rng_state)
    elif rng_type == RNGType.XPU:
        torch.xpu.set_rng_state(rng_state)
    elif rng_type == RNGType.HPU:
        torch.hpu.set_rng_state(rng_state)
    elif rng_type == RNGType.NEURON:
        torch.neuron.set_rng_state(rng_state)
    elif rng_type == RNGType.XLA:
        xm.set_rng_state(rng_state.item())
    elif rng_type == RNGType.GENERATOR:
        generator.set_state(rng_state)


def synchronize_rng_states(rng_types: list[Union[str, RNGType]], generator: Optional[torch.Generator] = None):
    """
    Synchronize several random number generators, one after the other.

    Args:
        rng_types (`list` of `str` or `RNGType`):
            The RNGs to synchronize, in order. Each entry is converted with `RNGType(...)`.
        generator (`torch.Generator`, *optional*):
            Forwarded to `synchronize_rng_state` for every entry.
    """
    for requested_type in rng_types:
        kind = RNGType(requested_type)
        synchronize_rng_state(kind, generator=generator)


================================================
FILE: src/accelerate/utils/rich.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .imports import is_rich_available


# Importing this module has a side effect: when `rich` is available, its traceback handler is installed
# (with local variables hidden); otherwise the import itself fails with `ModuleNotFoundError`.
if is_rich_available():
    from rich.traceback import install

    install(show_locals=False)

else:
    raise ModuleNotFoundError("To use the rich extension, install rich with `pip install rich`")


================================================
FILE: src/accelerate/utils/torch_xla.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib.metadata
import subprocess
import sys


def install_xla(upgrade: bool = False):
    """
    Helper function to install appropriate xla wheels based on the `torch` version in Google Colaboratory.

    Args:
        upgrade (`bool`, *optional*, defaults to `False`):
            Whether to upgrade `torch` and install the latest `torch_xla` wheels.

    Raises:
        `RuntimeError`: when not running inside Google Colaboratory.

    Example:

    ```python
    >>> from accelerate.utils import install_xla

    >>> install_xla(upgrade=True)
    ```
    """
    # Colab is detected through the representation of the active IPython shell
    in_colab = "IPython" in sys.modules and "google.colab" in str(sys.modules["IPython"].get_ipython())
    if not in_colab:
        raise RuntimeError("`install_xla` utility works only on google colab.")

    if upgrade:
        subprocess.run(["pip", "install", "-U", "torch"], check=True)

    # get the current version of torch, without its last dot-separated component
    installed_torch = importlib.metadata.version("torch")
    torch_version_trunc = installed_torch[: installed_torch.rindex(".")]
    xla_wheel = f"https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-{torch_version_trunc}-cp37-cp37m-linux_x86_64.whl"
    subprocess.run(["pip", "install", xla_wheel], check=True)


================================================
FILE: src/accelerate/utils/tqdm.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from .imports import is_tqdm_available


if is_tqdm_available():
    from tqdm.auto import tqdm as _tqdm

from ..state import PartialState


def tqdm(*args, main_process_only: bool = True, **kwargs):
    """
    Wrapper around `tqdm.tqdm` that optionally displays only on the main process.

    Args:
        main_process_only (`bool`, *optional*):
            Whether to display the progress bar only on the main process. In practice the bar is disabled on every
            process whose `local_process_index` is not 0, unless `disable` was already requested by the caller.

    Raises:
        `ImportError`: if `tqdm` is not installed.
        `ValueError`: if a boolean is passed as the first positional argument.
    """
    if not is_tqdm_available():
        raise ImportError("Accelerate's `tqdm` module requires `tqdm` to be installed. Please run `pip install tqdm`.")

    if args and isinstance(args[0], bool):
        raise ValueError(
            "Passing `True` or `False` as the first argument to Accelerate's `tqdm` wrapper is unsupported. "
            "Please use the `main_process_only` keyword argument instead."
        )

    hide_bar = kwargs.pop("disable", False)
    if not hide_bar and main_process_only:
        hide_bar = PartialState().local_process_index != 0
    return _tqdm(*args, **kwargs, disable=hide_bar)


================================================
FILE: src/accelerate/utils/transformer_engine.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from types import MethodType

import torch.nn as nn

from .imports import is_hpu_available, is_transformer_engine_available
from .operations import GatheredParameters


# Do not import `transformer_engine` at package level to avoid potential issues


def convert_model(model, to_transformer_engine=True, _convert_linear=True, _convert_ln=True):
    """
    Recursively converts the linear and layernorm layers of a model to their `transformer_engine` counterpart, or
    back to plain `torch.nn` layers when `to_transformer_engine=False`.

    The conversion is done in place: matching direct children of `model` are replaced with `setattr`, every other
    child is visited recursively. `nn.Linear` layers with a weight dimension that is not a multiple of 16 are left
    untouched.

    Args:
        model (`torch.nn.Module`):
            The model whose children should be converted.
        to_transformer_engine (`bool`, *optional*, defaults to `True`):
            Direction of the conversion: `torch.nn` -> `transformer_engine` when `True`, the reverse when `False`.
        _convert_linear (`bool`, *optional*, defaults to `True`):
            Whether linear layers are converted.
        _convert_ln (`bool`, *optional*, defaults to `True`):
            Whether layernorm layers are converted.

    Raises:
        `ImportError`: if `transformer_engine` is not available.
    """
    if not is_transformer_engine_available():
        raise ImportError("Using `convert_model` requires transformer_engine to be installed.")

    if is_hpu_available():
        import intel_transformer_engine as te

        if not hasattr(te, "LayerNorm"):
            # HPU does not have a LayerNorm implementation in TE
            te.LayerNorm = nn.LayerNorm
    else:
        import transformer_engine.pytorch as te

    for name, module in model.named_children():
        if isinstance(module, nn.Linear) and to_transformer_engine and _convert_linear:
            has_bias = module.bias is not None
            params_to_gather = [module.weight]
            if has_bias:
                params_to_gather.append(module.bias)

            with GatheredParameters(params_to_gather, modifier_rank=0):
                if any(p % 16 != 0 for p in module.weight.shape):
                    # Only this layer is skipped; the remaining siblings must still be visited
                    continue
                te_module = te.Linear(
                    module.in_features, module.out_features, bias=has_bias, params_dtype=module.weight.dtype
                )
                te_module.weight.copy_(module.weight)
                if has_bias:
                    te_module.bias.copy_(module.bias)

                setattr(model, name, te_module)
        # Note: @xrsrke (Phuc) found that te.LayerNorm doesn't have any real memory savings or speedups over nn.LayerNorm
        elif isinstance(module, nn.LayerNorm) and to_transformer_engine and _convert_ln:
            with GatheredParameters([module.weight, module.bias], modifier_rank=0):
                has_bias = module.bias is not None
                te_module = te.LayerNorm(module.normalized_shape[0], eps=module.eps, params_dtype=module.weight.dtype)
                te_module.weight.copy_(module.weight)
                if has_bias:
                    te_module.bias.copy_(module.bias)

            setattr(model, name, te_module)
        elif isinstance(module, te.Linear) and not to_transformer_engine and _convert_linear:
            has_bias = module.bias is not None
            # `torch.nn` layers take `dtype`, not the `params_dtype` keyword used by transformer_engine layers
            new_module = nn.Linear(module.in_features, module.out_features, bias=has_bias, dtype=module.weight.dtype)
            new_module.weight.copy_(module.weight)
            if has_bias:
                new_module.bias.copy_(module.bias)

            setattr(model, name, new_module)
        elif isinstance(module, te.LayerNorm) and not to_transformer_engine and _convert_ln:
            # `torch.nn` layers take `dtype`, not the `params_dtype` keyword used by transformer_engine layers
            new_module = nn.LayerNorm(module.normalized_shape[0], eps=module.eps, dtype=module.weight.dtype)
            new_module.weight.copy_(module.weight)
            new_module.bias.copy_(module.bias)

            setattr(model, name, new_module)
        else:
            convert_model(
                module,
                to_transformer_engine=to_transformer_engine,
                _convert_linear=_convert_linear,
                _convert_ln=_convert_ln,
            )


def has_transformer_engine_layers(model):
    """
    Returns whether a given model has some `transformer_engine` layer or not.

    On HPU only `te.Linear` is looked for; otherwise `te.LayerNorm`, `te.Linear` and `te.TransformerLayer` all
    count.

    Raises:
        `ImportError`: if `transformer_engine` is not available.
    """
    if not is_transformer_engine_available():
        raise ImportError("Using `has_transformer_engine_layers` requires transformer_engine to be installed.")

    if is_hpu_available():
        import intel_transformer_engine as te

        te_layer_types = te.Linear
    else:
        import transformer_engine.pytorch as te

        te_layer_types = (te.LayerNorm, te.Linear, te.TransformerLayer)

    return any(isinstance(layer, te_layer_types) for layer in model.modules())


def contextual_fp8_autocast(model_forward, fp8_recipe, use_during_eval=False):
    """
    Build a replacement `forward` that runs `model_forward` inside an FP8 autocast context.

    The wrapper is context aware: FP8 autocast is enabled while the module is in training mode, and during
    evaluation only when `use_during_eval` is set (disabling it in eval mode is generally better for more accurate
    metrics). The returned function expects the module as its first argument (`self`) and does not forward it to
    `model_forward`.

    Raises:
        `ImportError`: if `transformer_engine` is not available.
    """
    if not is_transformer_engine_available():
        raise ImportError("Using `contextual_fp8_autocast` requires transformer_engine to be installed.")

    if not is_hpu_available():
        from transformer_engine.pytorch import fp8_autocast
    else:
        from intel_transformer_engine import fp8_autocast

    def forward(self, *args, **kwargs):
        with fp8_autocast(enabled=use_during_eval or self.training, fp8_recipe=fp8_recipe):
            return model_forward(*args, **kwargs)

    # To act like a decorator so that it can be popped when doing `extract_model_from_parallel`
    forward.__wrapped__ = model_forward

    return forward


def apply_fp8_autowrap(model, fp8_recipe_handler):
    """
    Applies FP8 context manager to the model's forward method

    Args:
        model:
            The model whose `forward` is replaced by a version running under FP8 autocast.
        fp8_recipe_handler:
            Object whose `to_kwargs()` provides the recipe arguments, or `None` to use the recipe defaults. The
            `use_autocast_during_eval` and `use_mxfp8_block_scaling` entries are consumed here and not forwarded to
            the recipe.

    Returns:
        The same `model` object, with its `forward` attribute replaced.

    Raises:
        `ImportError`: if transformer_engine is not available.
        `ValueError`: if MXFP8 block scaling is requested but unavailable, or is combined with `amax_compute_algo` /
            `amax_history_len`.
    """
    if not is_transformer_engine_available():
        raise ImportError("Using `apply_fp8_autowrap` requires transformer_engine to be installed.")

    if is_hpu_available():
        import intel_transformer_engine.recipe as te_recipe

        is_fp8_block_scaling_available = False
        message = "MXFP8 block scaling is not available on HPU."

    else:
        import transformer_engine.common.recipe as te_recipe
        from transformer_engine.pytorch.fp8 import check_mxfp8_support

        is_fp8_block_scaling_available, message = check_mxfp8_support()

    kwargs = fp8_recipe_handler.to_kwargs() if fp8_recipe_handler is not None else {}
    if "fp8_format" in kwargs:
        # The handler stores the format by name; the recipe expects the matching `Format` member.
        kwargs["fp8_format"] = getattr(te_recipe.Format, kwargs["fp8_format"])
    use_during_eval = kwargs.pop("use_autocast_during_eval", False)
    use_mxfp8_block_scaling = kwargs.pop("use_mxfp8_block_scaling", False)

    if use_mxfp8_block_scaling and not is_fp8_block_scaling_available:
        raise ValueError(f"MXFP8 block scaling is not available: {message}")

    if use_mxfp8_block_scaling:
        if "amax_compute_algo" in kwargs:
            raise ValueError("`amax_compute_algo` is not supported for MXFP8 block scaling.")
        if "amax_history_len" in kwargs:
            raise ValueError("`amax_history_len` is not supported for MXFP8 block scaling.")
        fp8_recipe = te_recipe.MXFP8BlockScaling(**kwargs)
    else:
        fp8_recipe = te_recipe.DelayedScaling(**kwargs)

    new_forward = contextual_fp8_autocast(model.forward, fp8_recipe, use_during_eval)

    # `new_forward` has the signature `(self, *args, **kwargs)` and reads `self.training`, while it calls the
    # captured `model.forward` without `self`. It therefore has to be bound to `model` in every case: assigning it
    # unbound (as was done when `model.forward` had no `__func__`, e.g. after it had already been replaced by a
    # plain function) would make the first call argument be consumed as `self`.
    model.forward = MethodType(new_forward, model)

    return model


================================================
FILE: src/accelerate/utils/versions.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib.metadata
from typing import Union

from packaging.version import Version, parse

from .constants import STR_OPERATION_TO_FUNC


# Version of the installed `torch` distribution, read from package metadata and parsed once at import time.
torch_version = parse(importlib.metadata.version("torch"))


def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str):
    """
    Evaluates `<library version> <operation> <requirement_version>` and returns the result.

    Args:
        library_or_version (`str` or `packaging.version.Version`):
            Either the name of an installed library (its version is then read from the package metadata) or an
            already parsed version.
        operation (`str`):
            The comparison operator as a string, e.g. `">"` or `"<="`. Must be a key of `STR_OPERATION_TO_FUNC`.
        requirement_version (`str`):
            The version string to compare against.

    Raises:
        `ValueError`: if `operation` is not a supported operator string.
    """
    if operation not in STR_OPERATION_TO_FUNC:
        raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}")
    comparator = STR_OPERATION_TO_FUNC[operation]

    if isinstance(library_or_version, str):
        # A library name was given: resolve it to the installed version first.
        actual_version = parse(importlib.metadata.version(library_or_version))
    else:
        actual_version = library_or_version

    required_version = parse(requirement_version)
    return comparator(actual_version, required_version)


def is_torch_version(operation: str, version: str):
    """
    Checks the installed PyTorch version (the module-level `torch_version`) against a reference version.

    Args:
        operation (`str`):
            The comparison operator as a string, e.g. `">"` or `"<="`
        version (`str`):
            The PyTorch version string to compare against

    Returns:
        The result of `compare_versions` for `torch_version <operation> version`.
    """
    return compare_versions(
        library_or_version=torch_version,
        operation=operation,
        requirement_version=version,
    )


================================================
FILE: tests/__init__.py
================================================
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: tests/deepspeed/ds_config_zero2.json
================================================
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto",
            "torch_adam": true,
            "adam_w_mode": true
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": "auto",
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

================================================
FILE: tests/deepspeed/ds_config_zero2_model_only.json
================================================
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": "auto",
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

================================================
FILE: tests/deepspeed/ds_config_zero3.json
================================================
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto",
            "torch_adam": true,
            "adam_w_mode": true
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": "auto"
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

================================================
FILE: tests/deepspeed/ds_config_zero3_model_only.json
================================================
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "zero_optimization": {
        "stage": 3,
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": 1e9,
        "stage3_prefetch_bucket_size": 1e9,
        "stage3_param_persistence_threshold": 1e9,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "train_micro_batch_size_per_gpu": 1
}

================================================
FILE: tests/deepspeed/test_alst_ulysses_sp.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parameterized import parameterized

from accelerate.test_utils.testing import (
    TempDirTestCase,
    execute_subprocess_async,
    path_in_accelerate_package,
    require_deepspeed,
    require_multi_device,
)
from accelerate.utils import patch_environment


@require_deepspeed
@require_multi_device
class DeepSpeedALSTUlyssesSPTest(TempDirTestCase):
    """
    Runs the external `test_ds_alst_ulysses_sp.py` script through `accelerate launch` with DeepSpeed enabled, once
    per ZeRO stage listed in the parameterization.
    """

    test_scripts_folder = path_in_accelerate_package("test_utils", "scripts", "external_deps")

    @parameterized.expand([2, 3])
    def test_deepspeed_alst_ulysses_sp(self, stage):
        world_size = 2
        self.test_file_path = self.test_scripts_folder / "test_ds_alst_ulysses_sp.py"

        launcher = ["accelerate", "launch"]
        launch_options = [
            f"--num_processes={world_size}",
            "--num_machines=1",
            "--machine_rank=0",
            "--mixed_precision=bf16",
            "--use_deepspeed",
            f"--zero_stage={stage}",
        ]
        # The script path comes first, then the arguments meant for the script itself.
        script_with_args = [self.test_file_path, f"--output_dir={self.tmpdir}"]

        cmd = launcher + launch_options + script_with_args
        with patch_environment(omp_num_threads=1):
            execute_subprocess_async(cmd)


================================================
FILE: tests/deepspeed/test_deepspeed.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import itertools
import json
import os
import tempfile
from copy import deepcopy
from pathlib import Path

import torch
from parameterized import parameterized
from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, get_scheduler

from accelerate.accelerator import Accelerator
from accelerate.scheduler import AcceleratedScheduler
from accelerate.state import AcceleratorState
from accelerate.test_utils.testing import (
    AccelerateTestCase,
    TempDirTestCase,
    execute_subprocess_async,
    path_in_accelerate_package,
    require_deepspeed,
    require_fp16,
    require_huggingface_suite,
    require_multi_device,
    require_non_cpu,
    run_first,
    slow,
)
from accelerate.test_utils.training import RegressionDataset, RegressionModel
from accelerate.utils import is_bf16_available, is_fp16_available, patch_environment, set_seed
from accelerate.utils.dataclasses import DeepSpeedPlugin
from accelerate.utils.deepspeed import (
    DeepSpeedEngineWrapper,
    DeepSpeedOptimizerWrapper,
    DeepSpeedSchedulerWrapper,
    DummyOptim,
    DummyScheduler,
)
from accelerate.utils.versions import compare_versions


# Seed once at import time, before any test in this module runs.
set_seed(42)

# Model identifiers; GPT2_TINY is passed to `AutoModel.from_pretrained` in the tests below.
# NOTE(review): MOBILEVIT and QWEN_MOE are presumably used the same way further down the file -- confirm.
GPT2_TINY = "sshleifer/tiny-gpt2"
MOBILEVIT = "apple/mobilevit-xx-small"
QWEN_MOE = "peft-internal-testing/tiny-random-qwen-1.5-MoE"

# ZeRO stage labels; they are the keys of `ds_config_file` / `ds_config_dict` built in `setUp`.
ZERO2 = "zero2"
ZERO3 = "zero3"

# Mixed-precision labels; they double as top-level keys of the DeepSpeed config ("fp16" / "bf16" sections).
FP16 = "fp16"
BF16 = "bf16"

# Labels for the optimizer / scheduler combinations exercised by `test_prepare_deepspeed`.
CUSTOM_OPTIMIZER = "custom_optimizer"
CUSTOM_SCHEDULER = "custom_scheduler"
DS_OPTIMIZER = "deepspeed_optimizer"
DS_SCHEDULER = "deepspeed_scheduler"

# Labels for the kinds of model config under test.
NO_CONFIG = "no_config"
CONFIG_WITH_NO_HIDDEN_SIZE = "config_with_no_hidden_size"
CONFIG_WITH_HIDDEN_SIZE = "config_with_hidden_size"
CONFIG_WITH_HIDDEN_SIZES = "config_with_hidden_sizes"

stages = [ZERO2, ZERO3]
optims = [CUSTOM_OPTIMIZER, DS_OPTIMIZER]
schedulers = [CUSTOM_SCHEDULER, DS_SCHEDULER]
model_types = [NO_CONFIG, CONFIG_WITH_NO_HIDDEN_SIZE, CONFIG_WITH_HIDDEN_SIZE, CONFIG_WITH_HIDDEN_SIZES]

# Only parameterize over the precisions the availability helpers report as usable.
dtypes = []
if is_bf16_available():
    dtypes.append(BF16)
if is_fp16_available():
    dtypes.append(FP16)


def parameterized_custom_name_func(func, param_num, param):
    """
    Name generator for `parameterized.expand` that includes every parameter in the sub-test name.

    The default generator only shows the first parameter; here all of `param.args` are stringified, joined with
    underscores and sanitized before being appended to the test function's name. `param_num` is unused.
    """
    joined_args = "_".join(map(str, param.args))
    safe_suffix = parameterized.to_safe_name(joined_args)
    return f"{func.__name__}_{safe_suffix}"


# Cartesian product of ZeRO stages with the available mixed-precision dtypes to test
params = list(itertools.product(stages, dtypes))
# Cartesian product of optimizer kinds with scheduler kinds (custom vs. DeepSpeed-provided)
optim_scheduler_params = list(itertools.product(optims, schedulers))


class DummyConfig:
    """Bare-bones config object that only defines `_name_or_path`."""

    # NOTE(review): presumably stands in for a model config lacking `hidden_size` -- confirm against its users.
    def __init__(self) -> None:
        self._name_or_path = "dummy"


@require_deepspeed
@require_non_cpu
class DeepSpeedConfigIntegration(AccelerateTestCase):
    def setUp(self):
        """
        Locates the DeepSpeed JSON configs that sit next to this test file, loads them, and prepares the environment
        variables used to emulate a single-process distributed run.
        """
        super().setUp()

        self._test_file_path = inspect.getfile(self.__class__)
        test_dir = Path(self._test_file_path).resolve().parent
        self.test_file_dir_str = str(test_dir)

        self.ds_config_file = {
            "zero2": f"{self.test_file_dir_str}/ds_config_zero2.json",
            "zero3": f"{self.test_file_dir_str}/ds_config_zero3.json",
        }

        # Tests should read these through self.get_config_dict(stage) so the loaded originals are never modified
        with open(self.ds_config_file[ZERO2], encoding="utf-8") as zero2_file:
            zero2_config = json.load(zero2_file)
        with open(self.ds_config_file[ZERO3], encoding="utf-8") as zero3_file:
            zero3_config = json.load(zero3_file)
        # Gathering 16-bit weights on save slows things down, so it is off unless a test needs it. The file keeps
        # it as a demo for users, since everything should work out of the box even if slower.
        zero3_config["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = False

        self.ds_config_dict = {"zero2": zero2_config, "zero3": zero3_config}

        self.dist_env = {
            "ACCELERATE_USE_DEEPSPEED": "true",
            "MASTER_ADDR": "localhost",
            "MASTER_PORT": "10999",
            "RANK": "0",
            "LOCAL_RANK": "0",
            "WORLD_SIZE": "1",
        }

    def get_config_dict(self, stage):
        """Returns an independent deep copy of the loaded config for `stage`, since some tests modify the dict."""
        pristine_config = self.ds_config_dict[stage]
        return deepcopy(pristine_config)

    @parameterized.expand(stages, name_func=parameterized_custom_name_func)
    def test_deepspeed_plugin(self, stage):
        """
        Checks `DeepSpeedPlugin` construction and `deepspeed_config_process` for the given ZeRO `stage`:
        `zero3_init_flag` handling, loading of config files, the default for `gradient_accumulation_steps`, the error
        on a missing `zero_optimization` section, filling of config values from kwargs, and detection of
        mismatched / missing values.
        """
        # Test zero3_init_flag will be set to False when ZeRO stage != 3
        deepspeed_plugin = DeepSpeedPlugin(
            gradient_accumulation_steps=1,
            gradient_clipping=1.0,
            zero_stage=2,
            offload_optimizer_device="cpu",
            offload_param_device="cpu",
            zero3_save_16bit_model=True,
            zero3_init_flag=True,
        )
        assert not deepspeed_plugin.zero3_init_flag
        deepspeed_plugin.deepspeed_config = None

        # Test zero3_init_flag will be set to True only when ZeRO stage == 3
        deepspeed_plugin = DeepSpeedPlugin(
            gradient_accumulation_steps=1,
            gradient_clipping=1.0,
            zero_stage=3,
            offload_optimizer_device="cpu",
            offload_param_device="cpu",
            zero3_save_16bit_model=True,
            zero3_init_flag=True,
        )
        assert deepspeed_plugin.zero3_init_flag
        deepspeed_plugin.deepspeed_config = None

        # Test config files are loaded correctly
        deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[stage], zero3_init_flag=True)
        if stage == ZERO2:
            assert not deepspeed_plugin.zero3_init_flag
        elif stage == ZERO3:
            assert deepspeed_plugin.zero3_init_flag

        # Test `gradient_accumulation_steps` is set to 1 if unavailable in config file
        with tempfile.TemporaryDirectory() as dirpath:
            ds_config = self.get_config_dict(stage)
            del ds_config["gradient_accumulation_steps"]
            with open(os.path.join(dirpath, "ds_config.json"), "w") as out_file:
                json.dump(ds_config, out_file)
            deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=os.path.join(dirpath, "ds_config.json"))
            assert deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"] == 1
            deepspeed_plugin.deepspeed_config = None

        # Test `ValueError` is raised if `zero_optimization` is unavailable in config file
        with tempfile.TemporaryDirectory() as dirpath:
            ds_config = self.get_config_dict(stage)
            del ds_config["zero_optimization"]
            with open(os.path.join(dirpath, "ds_config.json"), "w") as out_file:
                json.dump(ds_config, out_file)
            with self.assertRaises(ValueError) as cm:
                deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=os.path.join(dirpath, "ds_config.json"))
            assert "Please specify the ZeRO optimization config in the DeepSpeed config." in str(cm.exception)
            # The constructor above raised, so `deepspeed_plugin` still refers to the plugin built in the previous
            # section; this resets that one.
            deepspeed_plugin.deepspeed_config = None

        # Test `deepspeed_config_process`
        deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[stage])
        kwargs = {
            "fp16.enabled": True,
            "bf16.enabled": False,
            "optimizer.params.lr": 5e-5,
            "optimizer.params.weight_decay": 0.0,
            "scheduler.params.warmup_min_lr": 0.0,
            "scheduler.params.warmup_max_lr": 5e-5,
            "scheduler.params.warmup_num_steps": 0,
            "train_micro_batch_size_per_gpu": 16,
            "gradient_clipping": 1.0,
            "train_batch_size": 16,
            "zero_optimization.reduce_bucket_size": 5e5,
            "zero_optimization.stage3_prefetch_bucket_size": 5e5,
            "zero_optimization.stage3_param_persistence_threshold": 5e5,
            "zero_optimization.stage3_gather_16bit_weights_on_model_save": False,
        }
        deepspeed_plugin.deepspeed_config_process(**kwargs)
        for ds_key_long, value in kwargs.items():
            config, ds_key = deepspeed_plugin.hf_ds_config.find_config_node(ds_key_long)
            # Keys absent from this stage's config (e.g. stage3-only entries for ZeRO-2) are skipped
            if config.get(ds_key) is not None:
                assert config.get(ds_key) == value

        # Test mismatches
        mismatches = {
            "optimizer.params.lr": 1e-5,
            "optimizer.params.weight_decay": 1e-5,
            "gradient_accumulation_steps": 2,
        }
        with self.assertRaises(ValueError) as cm:
            new_kwargs = deepcopy(kwargs)
            new_kwargs.update(mismatches)
            deepspeed_plugin.deepspeed_config_process(**new_kwargs)
        # Every mismatching key must be named in the error message
        for key in mismatches.keys():
            assert key in str(cm.exception), f"{key} is not in the exception message: {cm.exception}"

        # Test `ValueError` is raised if some config file fields with `auto` value is missing in `kwargs`
        deepspeed_plugin.deepspeed_config["optimizer"]["params"]["lr"] = "auto"
        with self.assertRaises(ValueError) as cm:
            del kwargs["optimizer.params.lr"]
            deepspeed_plugin.deepspeed_config_process(**kwargs)
        assert "`optimizer.params.lr` not found in kwargs." in str(cm.exception)

    @parameterized.expand(dtypes, name_func=parameterized_custom_name_func)
    def test_accelerate_state_deepspeed(self, dtype):
        """
        Checks that creating an `Accelerator` with a given `mixed_precision` enables the matching precision section
        (`fp16` / `bf16`) in the DeepSpeed config held by the accelerator state.
        """
        AcceleratorState._reset_state(True)
        deepspeed_plugin = DeepSpeedPlugin(
            gradient_accumulation_steps=1,
            gradient_clipping=1.0,
            # `zero_stage` takes the integer stage. `ZERO2` is the string "zero2" used as a config-file key, so
            # passing it here would put a non-numeric stage into the generated config.
            zero_stage=2,
            offload_optimizer_device="cpu",
            offload_param_device="cpu",
            zero3_save_16bit_model=True,
            zero3_init_flag=True,
        )
        with patch_environment(**self.dist_env):
            state = Accelerator(mixed_precision=dtype, deepspeed_plugin=deepspeed_plugin).state
            assert state.deepspeed_plugin.deepspeed_config[dtype]["enabled"]

    def test_init_zero3(self):
        """
        Checks that building an `Accelerator` from a ZeRO-3 plugin makes transformers report DeepSpeed ZeRO-3 as
        enabled.
        """
        plugin_options = {
            "gradient_accumulation_steps": 1,
            "gradient_clipping": 1.0,
            "zero_stage": 3,
            "offload_optimizer_device": "cpu",
            "offload_param_device": "cpu",
            "zero3_save_16bit_model": True,
            "zero3_init_flag": True,
        }
        deepspeed_plugin = DeepSpeedPlugin(**plugin_options)

        with patch_environment(**self.dist_env):
            # Kept in a local so the accelerator stays alive while the check below runs
            accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)  # noqa: F841
            from transformers.integrations import is_deepspeed_zero3_enabled

            zero3_enabled = is_deepspeed_zero3_enabled()
            assert zero3_enabled

    @parameterized.expand(optim_scheduler_params, name_func=parameterized_custom_name_func)
    @require_fp16
    def test_prepare_deepspeed(self, optim_type, scheduler_type):
        # 1. Testing with one of the ZeRO Stages is enough to test the `_prepare_deepspeed` function.
        # Here we test using ZeRO Stage 2 with FP16 enabled.
        from deepspeed.runtime.engine import DeepSpeedEngine

        kwargs = {
            "optimizer.params.lr": 5e-5,
            "optimizer.params.weight_decay": 0.0,
            "scheduler.params.warmup_min_lr": 0.0,
            "scheduler.params.warmup_max_lr": 5e-5,
            "scheduler.params.warmup_num_steps": 0,
            "train_micro_batch_size_per_gpu": 16,
            "gradient_clipping": 1.0,
            "train_batch_size": 16,
            "zero_optimization.reduce_bucket_size": 5e5,
            "zero_optimization.stage3_prefetch_bucket_size": 5e5,
            "zero_optimization.stage3_param_persistence_threshold": 5e5,
            "zero_optimization.stage3_gather_16bit_weights_on_model_save": False,
        }

        if optim_type == CUSTOM_OPTIMIZER and scheduler_type == CUSTOM_SCHEDULER:
            # Test custom optimizer + custom scheduler
            deepspeed_plugin = DeepSpeedPlugin(
                gradient_accumulation_steps=1,
                gradient_clipping=1.0,
                zero_stage=2,
                offload_optimizer_device="cpu",
                offload_param_device="cpu",
                zero3_save_16bit_model=False,
                zero3_init_flag=False,
            )
            with patch_environment(**self.dist_env):
                accelerator = Accelerator(mixed_precision="fp16", deepspeed_plugin=deepspeed_plugin)

                train_set = RegressionDataset(length=80)
                eval_set = RegressionDataset(length=20)
                train_dataloader = DataLoader(train_set, batch_size=16, shuffle=True)
                eval_dataloader = DataLoader(eval_set, batch_size=32, shuffle=False)
                model = AutoModel.from_pretrained(GPT2_TINY)
                optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
                lr_scheduler = get_scheduler(
                    name="linear",
                    optimizer=optimizer,
                    num_warmup_steps=0,
                    num_training_steps=1000,
                )
                dummy_optimizer = DummyOptim(params=model.parameters())
                dummy_lr_scheduler = DummyScheduler(dummy_optimizer)

                with self.assertRaises(ValueError) as cm:
                    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
                        model, dummy_optimizer, train_dataloader, eval_dataloader, lr_scheduler
                    )
                assert "You cannot create a `DummyOptim` without specifying an optimizer in the config file." in str(
                    cm.exception
                )
                with self.assertRaises(ValueError) as cm:
                    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
                        model, optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
                    )
                assert (
                    "Either specify a scheduler in the config file or "
                    "pass in the `lr_scheduler_callable` parameter when using `accelerate.utils.DummyScheduler`."
                    in str(cm.exception)
                )

                with self.assertRaises(ValueError) as cm:
                    model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
                assert (
                    "When using DeepSpeed, `accelerate.prepare()` requires you to pass at least one of training or evaluation dataloaders "
                    "with `batch_size` attribute returning an integer value "
                    "or alternatively set an integer value in `train_micro_batch_size_per_gpu` in the deepspeed config file "
                    "or assign integer value to `AcceleratorState().deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu']`."
                    in str(cm.exception)
                )

                model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
                    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
                )
                assert accelerator.deepspeed_config["zero_allow_untested_optimizer"]
                assert accelerator.deepspeed_config["train_batch_size"], 16
                assert type(model) is DeepSpeedEngine
                assert type(optimizer) is DeepSpeedOptimizerWrapper
                assert type(lr_scheduler) is AcceleratedScheduler
                assert type(accelerator.deepspeed_engine_wrapped) is DeepSpeedEngineWrapper

        elif optim_type == DS_OPTIMIZER and scheduler_type == DS_SCHEDULER:
            # Test DeepSpeed optimizer + DeepSpeed scheduler
            deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[ZERO2])
            with patch_environment(**self.dist_env):
                accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin, mixed_precision="fp16")
                train_set = RegressionDataset(length=80)
                eval_set = RegressionDataset(length=20)
                train_dataloader = DataLoader(train_set, batch_size=10, shuffle=True)
                eval_dataloader = DataLoader(eval_set, batch_size=5, shuffle=False)
                model = AutoModel.from_pretrained(GPT2_TINY)
                optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
                lr_scheduler = get_scheduler(
                    name="linear",
                    optimizer=optimizer,
                    num_warmup_steps=0,
                    num_training_steps=1000,
                )
                dummy_optimizer = DummyOptim(params=model.parameters())
                dummy_lr_scheduler = DummyScheduler(dummy_optimizer)
                kwargs["train_batch_size"] = (
                    kwargs["train_micro_batch_size_per_gpu"]
                    * deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"]
                    * accelerator.num_processes
                )
                accelerator.state.deepspeed_plugin.deepspeed_config_process(**kwargs)
                with self.assertRaises(ValueError) as cm:
                    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
                        model, optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
                    )
                assert "You cannot specify an optimizer in the config file and in the code at the same time" in str(
                    cm.exception
                )

                with self.assertRaises(ValueError) as cm:
                    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
                        model, dummy_optimizer, train_dataloader, eval_dataloader, lr_scheduler
                    )
                assert "You cannot specify a scheduler in the config file and in the code at the same time" in str(
                    cm.exception
                )

                with self.assertRaises(ValueError) as cm:
                    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
                        model, dummy_optimizer, train_dataloader, eval_dataloader, lr_scheduler
                    )
                assert "You cannot specify a scheduler in the config file and in the code at the same time" in str(
                    cm.exception
                )

                model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
                    model, dummy_optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
                )
                assert type(model) is DeepSpeedEngine
                assert type(optimizer) is DeepSpeedOptimizerWrapper
                assert type(lr_scheduler) is DeepSpeedSchedulerWrapper
                assert type(accelerator.deepspeed_engine_wrapped) is DeepSpeedEngineWrapper

        elif optim_type == CUSTOM_OPTIMIZER and scheduler_type == DS_SCHEDULER:
            # Test custom optimizer + DeepSpeed scheduler
            deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[ZERO2])
            with patch_environment(**self.dist_env):
                accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin, mixed_precision="fp16")
                train_set = RegressionDataset(length=80)
                eval_set = RegressionDataset(length=20)
                train_dataloader = DataLoader(train_set, batch_size=10, shuffle=True)
                eval_dataloader = DataLoader(eval_set, batch_size=5, shuffle=False)
                model = AutoModel.from_pretrained(GPT2_TINY)
                optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
                lr_scheduler = get_scheduler(
                    name="linear",
                    optimizer=optimizer,
                    num_warmup_steps=0,
                    num_training_steps=1000,
                )
                dummy_optimizer = DummyOptim(params=model.parameters())
                dummy_lr_scheduler = DummyScheduler(dummy_optimizer)
                kwargs["train_batch_size"] = (
                    kwargs["train_micro_batch_size_per_gpu"]
                    * deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"]
                    * accelerator.num_processes
                )
                accelerator.state.deepspeed_plugin.deepspeed_config_process(**kwargs)
                del accelerator.state.deepspeed_plugin.deepspeed_config["optimizer"]
                model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
                    model, optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
                )
                assert type(model) is DeepSpeedEngine
                assert type(optimizer) is DeepSpeedOptimizerWrapper
                assert type(lr_scheduler) is DeepSpeedSchedulerWrapper
                assert type(accelerator.deepspeed_engine_wrapped) is DeepSpeedEngineWrapper
        elif optim_type == DS_OPTIMIZER and scheduler_type is CUSTOM_SCHEDULER:
            # Test deepspeed optimizer + custom scheduler
            deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[ZERO2])
            with patch_environment(**self.dist_env):
                accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin, mixed_precision="fp16")
                train_set = RegressionDataset(length=80)
                eval_set = RegressionDataset(length=20)
                train_dataloader = DataLoader(train_set, batch_size=10, shuffle=True)
                eval_dataloader = DataLoader(eval_set, batch_size=5, shuffle=False)
                model = AutoModel.from_pretrained(GPT2_TINY)
                optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
                lr_scheduler = get_scheduler(
                    name="linear",
                    optimizer=optimizer,
                    num_warmup_steps=0,
                    num_training_steps=1000,
                )
                dummy_optimizer = DummyOptim(params=model.parameters())
                dummy_lr_scheduler = DummyScheduler(dummy_optimizer)
                kwargs["train_batch_size"] = (
                    kwargs["train_micro_batch_size_per_gpu"]
                    * deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"]
                    * accelerator.num_processes
                )
                accelerator.state.deepspeed_plugin.deepspeed_config_process(**kwargs)
                del accelerator.state.deepspeed_plugin.deepspeed_config["scheduler"]
                with self.assertRaises(ValueError) as cm:
                    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
                        model, dummy_optimizer, train_dataloader, eval_dataloader, lr_scheduler
                    )
                assert (
                    "You can only specify `accelerate.utils.DummyScheduler` in the code when using `accelerate.utils.DummyOptim`."
                    in str(cm.exception)
                )

                # passing `DummyScheduler` without `lr_scheduler_callable` should fail
                with self.assertRaises(ValueError) as cm:
                    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
                        model, dummy_optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
                    )
                assert (
                    "Either specify a scheduler in the config file or "
                    "pass in the `lr_scheduler_callable` parameter when using `accelerate.utils.DummyScheduler`."
                    in str(cm.exception)
                )

                # passing `lr_scheduler_callable` to DummyScheduler should enable DS Optim + Custom Scheduler
                def _lr_scheduler_callable(optimizer):
                    return get_scheduler(
                        name="linear",
                        optimizer=optimizer,
                        num_warmup_steps=0,
                        num_training_steps=1000,
                    )

                dummy_lr_scheduler = DummyScheduler(dummy_optimizer, lr_scheduler_callable=_lr_scheduler_callable)
                model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
                    model, dummy_optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
                )

    def test_dataloader_with_batch_sampler(self):
        """`prepare` must raise a descriptive `ValueError` for dataloaders built from a `batch_sampler`.

        Both dataloaders are created with an explicit `BatchSampler`, and the expected error message states
        that a dataloader with `None` as batch size was passed while no micro batch size is configured.
        """
        expected_message = (
            "At least one of the dataloaders passed to `accelerate.prepare()` has `None` as batch size. "
            "Please set an integer value in `train_micro_batch_size_per_gpu` in the deepspeed config file "
            "or assign integer value to `AcceleratorState().deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu']`."
        )
        plugin_options = {
            "gradient_accumulation_steps": 1,
            "gradient_clipping": 1.0,
            "zero_stage": 2,
            "offload_optimizer_device": "cpu",
            "offload_param_device": "cpu",
            "zero3_save_16bit_model": False,
            "zero3_init_flag": False,
        }
        plugin = DeepSpeedPlugin(**plugin_options)

        def batched_loader(dataset, sampler_cls):
            # Wrap the dataset in a DataLoader driven by an explicit batch sampler of size 10.
            batch_sampler = BatchSampler(sampler_cls(dataset), batch_size=10, drop_last=False)
            return DataLoader(dataset, batch_sampler=batch_sampler)

        with patch_environment(**self.dist_env):
            accelerator = Accelerator(mixed_precision="fp16", deepspeed_plugin=plugin)

            training_data = RegressionDataset(length=80)
            evaluation_data = RegressionDataset(length=20)
            training_loader = batched_loader(training_data, RandomSampler)
            evaluation_loader = batched_loader(evaluation_data, SequentialSampler)

            network = AutoModel.from_pretrained(GPT2_TINY)
            adamw = torch.optim.AdamW(network.parameters(), lr=5e-5)
            scheduler = get_scheduler(name="linear", optimizer=adamw, num_warmup_steps=0, num_training_steps=1000)

            # The call is expected to fail, so its return value is never bound.
            with self.assertRaises(ValueError) as cm:
                accelerator.prepare(network, adamw, training_loader, evaluation_loader, scheduler)
            assert expected_message in str(cm.exception)

    @require_fp16
    def test_save_checkpoints(self):
        """`get_state_dict` must raise when `stage3_gather_16bit_weights_on_model_save` is disabled.

        The ZeRO-3 config is loaded from file, its `bf16` section is removed, and explicit values
        (including `stage3_gather_16bit_weights_on_model_save=False`) are pushed into it through
        `deepspeed_config_process` before the model is prepared.
        """
        expected_message = (
            "Cannot get 16bit model weights because `stage3_gather_16bit_weights_on_model_save` in DeepSpeed config is False. "
            "To save the model weights in 16bit, set `stage3_gather_16bit_weights_on_model_save` to True in DeepSpeed config file or "
            "set `zero3_save_16bit_model` to True when using `accelerate config`. "
            "To save the full checkpoint, run `model.save_checkpoint(save_dir)` and use `zero_to_fp32.py` to recover weights."
        )

        plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[ZERO3], zero3_init_flag=True)
        del plugin.deepspeed_config["bf16"]

        config_overrides = {
            "optimizer.params.lr": 5e-5,
            "optimizer.params.weight_decay": 0.0,
            "scheduler.params.warmup_min_lr": 0.0,
            "scheduler.params.warmup_max_lr": 5e-5,
            "scheduler.params.warmup_num_steps": 0,
            "train_micro_batch_size_per_gpu": 16,
            "gradient_clipping": 1.0,
            "train_batch_size": 16,
            "zero_optimization.reduce_bucket_size": 5e5,
            "zero_optimization.stage3_prefetch_bucket_size": 5e5,
            "zero_optimization.stage3_param_persistence_threshold": 5e5,
            "zero_optimization.stage3_gather_16bit_weights_on_model_save": False,
        }

        with patch_environment(**self.dist_env):
            accelerator = Accelerator(deepspeed_plugin=plugin, mixed_precision="fp16")

            # Recompute the global batch size: micro batch * accumulation steps * number of processes.
            per_process_batch = (
                config_overrides["train_micro_batch_size_per_gpu"]
                * plugin.deepspeed_config["gradient_accumulation_steps"]
            )
            config_overrides["train_batch_size"] = per_process_batch * accelerator.num_processes
            accelerator.state.deepspeed_plugin.deepspeed_config_process(**config_overrides)

            training_data = RegressionDataset(length=80)
            evaluation_data = RegressionDataset(length=20)
            training_loader = DataLoader(training_data, batch_size=16, shuffle=True)
            evaluation_loader = DataLoader(evaluation_data, batch_size=32, shuffle=False)
            causal_lm = AutoModelForCausalLM.from_pretrained("gpt2")
            placeholder_optim = DummyOptim(params=causal_lm.parameters())
            placeholder_sched = DummyScheduler(placeholder_optim)

            prepared = accelerator.prepare(
                causal_lm, placeholder_optim, training_loader, evaluation_loader, placeholder_sched
            )
            # Only the prepared model (first element) is needed for the state-dict request.
            engine = prepared[0]

            with self.assertRaises(ValueError) as cm:
                accelerator.get_state_dict(engine)
            assert expected_message in str(cm.exception)

    def test_autofill_dsconfig(self):
        """Check the DeepSpeed config values that are in effect after `prepare` with dummy optimizer/scheduler.

        The batch sizes, optimizer and scheduler parameters are compared against the values given to the
        dataloaders and to `DummyOptim` / `DummyScheduler`; the ZeRO bucket sizes are compared against
        expressions of the model's `hidden_size`.
        """
        plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[ZERO3], zero3_init_flag=True)
        # Remove both precision sections from the loaded config before the Accelerator is created.
        for precision_section in ("bf16", "fp16"):
            del plugin.deepspeed_config[precision_section]

        with patch_environment(**self.dist_env):
            accelerator = Accelerator(deepspeed_plugin=plugin)

            training_data = RegressionDataset(length=80)
            evaluation_data = RegressionDataset(length=20)
            training_loader = DataLoader(training_data, batch_size=16, shuffle=True)
            evaluation_loader = DataLoader(evaluation_data, batch_size=32, shuffle=False)

            causal_lm = AutoModelForCausalLM.from_pretrained("gpt2")
            placeholder_optim = DummyOptim(params=causal_lm.parameters(), lr=5e-5, weight_decay=1e-4)
            placeholder_sched = DummyScheduler(placeholder_optim, warmup_num_steps=10, total_num_steps=1000)
            # Read the hidden size before `prepare`; the expected bucket sizes below are derived from it.
            hidden = causal_lm.config.hidden_size

            accelerator.prepare(causal_lm, placeholder_optim, training_loader, evaluation_loader, placeholder_sched)
            resolved = accelerator.deepspeed_config

            # Batch sizes.
            assert resolved["train_micro_batch_size_per_gpu"] == 16
            assert resolved["train_batch_size"] == 16

            # Optimizer parameters.
            optimizer_params = resolved["optimizer"]["params"]
            assert optimizer_params["lr"] == 5e-05
            assert optimizer_params["weight_decay"] == 1e-4

            # Scheduler parameters.
            scheduler_params = resolved["scheduler"]["params"]
            assert scheduler_params["warmup_min_lr"] == 0.0
            assert scheduler_params["warmup_max_lr"] == 5e-05
            assert scheduler_params["warmup_num_steps"] == 10

            assert resolved["gradient_clipping"] == 1.0

            # ZeRO communication buffers.
            zero_section = resolved["zero_optimization"]
            assert zero_section["reduce_bucket_size"] == (hidden * hidden)
            assert zero_section["stage3_prefetch_bucket_size"] == int((0.9 * hidden) * hidden)
            assert zero_section["stage3_param_persistence_threshold"] == (10 * hidden)
            assert not zero_section["stage3_gather_16bit_weights_on_model_save"]

    @parameterized.expand(model_types, name_func=parameterized_custom_name_func)
    @require_fp16
    def test_autofill_comm_buffers_dsconfig(self, model_type):
        """Check how the ZeRO-3 communication buffer sizes are resolved for different `model.config` shapes.

        Depending on `model_type` the model has no config, a config without any hidden size, a config with
        `hidden_size`, or a config with `hidden_sizes`. The first two cases must raise a `ValueError`; for
        the others the buffer sizes are compared against expressions of the (maximum) hidden size.
        """
        plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[ZERO3], zero3_init_flag=True)
        # Strip precision, optimizer and scheduler sections from the loaded config.
        for section in ("bf16", "fp16", "optimizer", "scheduler"):
            del plugin.deepspeed_config[section]

        with patch_environment(**self.dist_env):
            accelerator = Accelerator(mixed_precision="fp16", deepspeed_plugin=plugin)

            training_data = RegressionDataset(length=80)
            evaluation_data = RegressionDataset(length=20)
            training_loader = DataLoader(training_data, batch_size=16, shuffle=True)
            evaluation_loader = DataLoader(evaluation_data, batch_size=32, shuffle=False)

            regression_model = RegressionModel()
            # Attach the config variant under test; for the remaining model type nothing is attached.
            if model_type == CONFIG_WITH_NO_HIDDEN_SIZE:
                regression_model.config = DummyConfig()
            elif model_type == CONFIG_WITH_HIDDEN_SIZE:
                regression_model.config = AutoConfig.from_pretrained(GPT2_TINY)
                hidden = regression_model.config.hidden_size
            elif model_type == CONFIG_WITH_HIDDEN_SIZES:
                regression_model.config = AutoConfig.from_pretrained(MOBILEVIT)
                hidden = max(regression_model.config.hidden_sizes)

            adamw = torch.optim.AdamW(regression_model.parameters(), lr=5e-5)
            scheduler = get_scheduler(name="linear", optimizer=adamw, num_warmup_steps=0, num_training_steps=1000)
            components = (regression_model, adamw, training_loader, evaluation_loader, scheduler)

            # Work out which error, if any, `prepare` is expected to raise for this model type.
            if model_type == NO_CONFIG:
                expected_error = "Can't find `model.config` entry"
            elif model_type == CONFIG_WITH_NO_HIDDEN_SIZE:
                expected_error = "Can find neither `model.config.hidden_size` nor `model.config.hidden_sizes`"
            else:
                expected_error = None

            if expected_error is not None:
                with self.assertRaises(ValueError) as cm:
                    accelerator.prepare(*components)
                assert expected_error in str(cm.exception)
                return

            accelerator.prepare(*components)
            buffers = accelerator.deepspeed_config["zero_optimization"]
            assert buffers["reduce_bucket_size"] == (hidden * hidden)
            assert buffers["stage3_prefetch_bucket_size"] == int((0.9 * hidden) * hidden)
            assert buffers["stage3_param_persistence_threshold"] == (10 * hidden)

    @parameterized.expand(dtypes, name_func=parameterized_custom_name_func)
    def test_autofill_dsconfig_from_ds_plugin(self, dtype):
        """Check that `"auto"` entries of a DeepSpeed config are filled from plugin/Accelerator arguments.

        The test runs four phases, resetting `AcceleratorState` between them:

        1. `"auto"` values are resolved from the `DeepSpeedPlugin` keyword arguments.
        2. A `mixed_precision` that differs from the dtype section kept in the config raises a `ValueError`.
        3. A plugin built without a config file keeps the `gradient_accumulation_steps` it was given.
        4. An `"auto"` `gradient_accumulation_steps` takes the value passed to `Accelerator`.

        Args:
            dtype: Name of the mixed-precision section to keep in the config (compared against `BF16`).
        """
        # NOTE: this is the dict stored on `self`, so the edits below mutate it in place.
        ds_config = self.ds_config_dict["zero3"]
        # Keep only the precision section that matches `dtype`.
        if dtype == BF16:
            del ds_config["fp16"]
        else:
            del ds_config["bf16"]
        # Mark every value under test as "auto" so that it has to be resolved from elsewhere.
        ds_config[dtype]["enabled"] = "auto"
        ds_config["zero_optimization"]["stage"] = "auto"
        ds_config["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = "auto"
        ds_config["zero_optimization"]["offload_optimizer"]["device"] = "auto"
        ds_config["zero_optimization"]["offload_param"]["device"] = "auto"
        ds_config["gradient_accumulation_steps"] = "auto"
        ds_config["gradient_clipping"] = "auto"

        # Phase 1: the plugin arguments provide the values for the "auto" entries.
        deepspeed_plugin = DeepSpeedPlugin(
            hf_ds_config=ds_config,
            zero3_init_flag=True,
            gradient_accumulation_steps=2,
            gradient_clipping=1.0,
            zero_stage=2,
            offload_optimizer_device="cpu",
            offload_param_device="cpu",
            zero3_save_16bit_model=True,
        )

        with patch_environment(**self.dist_env):
            accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin, mixed_precision=dtype)
            config = accelerator.state.deepspeed_plugin.deepspeed_config
            assert config["gradient_clipping"] == 1.0
            assert config["gradient_accumulation_steps"] == 2
            assert config["zero_optimization"]["stage"] == 2
            assert config["zero_optimization"]["offload_optimizer"]["device"] == "cpu"
            assert config["zero_optimization"]["offload_param"]["device"] == "cpu"
            assert config["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"]
            assert config[dtype]["enabled"]

        # Phase 2: reuse the same plugin with the *other* precision and expect a conflict error.
        AcceleratorState._reset_state(True)
        diff_dtype = "bf16" if dtype == "fp16" else "fp16"
        with patch_environment(**self.dist_env):
            with self.assertRaises(ValueError) as cm:
                accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin, mixed_precision=diff_dtype)
            assert (
                f"`--mixed_precision` arg cannot be set to `{diff_dtype}` when `{dtype}` is set in the DeepSpeed config file."
                in str(cm.exception)
            )

        # base case of passing in `gradient_accumulation_steps` to `DeepSpeedPlugin`
        AcceleratorState._reset_state(True)
        deepspeed_plugin = DeepSpeedPlugin(zero_stage=2, gradient_accumulation_steps=4)
        with patch_environment(**self.dist_env):
            accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin, mixed_precision=dtype)
            deepspeed_plugin = accelerator.state.deepspeed_plugin
            assert deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"] == 4

        # filling the `auto` gradient_accumulation_steps via Accelerator's value
        AcceleratorState._reset_state(True)
        # No `gradient_accumulation_steps` is given to the plugin here; only the Accelerator receives one.
        deepspeed_plugin = DeepSpeedPlugin(
            hf_ds_config=ds_config,
            zero3_init_flag=True,
            gradient_clipping=1.0,
            zero_stage=2,
            offload_optimizer_device="cpu",
            offload_param_device="cpu",
            zero3_save_16bit_model=True,
        )
        with patch_environment(**self.dist_env):
            accelerator = Accelerator(
                deepspeed_plugin=deepspeed_plugin, mixed_precision=dtype, gradient_accumulation_steps=8
            )
            train_set = RegressionDataset(length=80)
            eval_set = RegressionDataset(length=20)
            train_dataloader = DataLoader(train_set, batch_size=16, shuffle=True)
            eval_dataloader = DataLoader(eval_set, batch_size=32, shuffle=False)
            model = AutoModelForCausalLM.from_pretrained("gpt2")
            dummy_optimizer = DummyOptim(params=model.parameters(), lr=5e-5, weight_decay=1e-4)
            dummy_lr_scheduler = DummyScheduler(dummy_optimizer, warmup_num_steps=10, total_num_steps=1000)
            model, _, train_dataloader, eval_dataloader, _ = accelerator.prepare(
                model, dummy_optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
            )
            # The value is checked only after `prepare` has run.
            deepspeed_plugin = accelerator.state.deepspeed_plugin
            assert deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"] == 8

    def test_ds_config_assertions(self):
        """Passing a DeepSpeed config file together with the fields listed in the environment must fail.

        `ACCELERATE_CONFIG_DS_FIELDS` is set to a list of field names, and the plugin is created with both
        a config file and explicit values for those fields; a `ValueError` with a removal hint is expected.
        """
        expected_hint = (
            "If you are using an accelerate config file, remove others config variables mentioned in the above specified list."
        )
        conflicting_fields = (
            "gradient_accumulation_steps,gradient_clipping,zero_stage,offload_optimizer_device,offload_param_device,zero3_save_16bit_model,mixed_precision"
        )
        # Work on a copy so the shared environment mapping of the test case stays untouched.
        env_with_ds_fields = self.dist_env.copy()
        env_with_ds_fields["ACCELERATE_CONFIG_DS_FIELDS"] = conflicting_fields

        plugin_kwargs = {
            "hf_ds_config": self.ds_config_file[ZERO3],
            "zero3_init_flag": True,
            "gradient_accumulation_steps": 1,
            "gradient_clipping": 1.0,
            "zero_stage": ZERO2,
            "offload_optimizer_device": "cpu",
            "offload_param_device": "cpu",
            "zero3_save_16bit_model": True,
        }

        with patch_environment(**env_with_ds_fields):
            # Both the plugin construction and the Accelerator construction are inside the expectation.
            with self.assertRaises(ValueError) as cm:
                plugin = DeepSpeedPlugin(**plugin_kwargs)
                Accelerator(deepspeed_plugin=plugin, mixed_precision=FP16)
            assert expected_hint in str(cm.exception)

    def test_ds_zero3_no_init_autofill(self):
        """Creating an Accelerator and loading a model must not raise for ZeRO-3 with `zero3_init_flag=False`.

        The config is passed as a dict whose `reduce_bucket_size` and batch sizes are `"auto"`. The test has
        no explicit assertions: it passes when neither the Accelerator construction nor `from_pretrained`
        raises.
        """
        zero_section = {
            "stage": 3,
            "allgather_partitions": True,
            "allgather_bucket_size": 5e8,
            "overlap_comm": True,
            "reduce_scatter": True,
            "reduce_bucket_size": "auto",
            "contiguous_gradients": True,
            "stage3_gather_16bit_weights_on_model_save": False,
            "offload_optimizer": {"device": "none"},
            "offload_param": {"device": "none"},
        }
        zero3_settings = {
            "bf16": {"enabled": True},
            "zero_optimization": zero_section,
            "gradient_clipping": 1.0,
            "gradient_accumulation_steps": 1,
            "train_batch_size": "auto",
            "train_micro_batch_size_per_gpu": "auto",
            "steps_per_print": 2000000,
        }
        plugin = DeepSpeedPlugin(hf_ds_config=zero3_settings, zero3_init_flag=False)

        with patch_environment(**self.dist_env):
            _accelerator = Accelerator(deepspeed_plugin=plugin)
            _model = AutoModelForCausalLM.from_pretrained("gpt2")

    @parameterized.expand(stages, name_func=parameterized_custom_name_func)
    def test_ds_config(self, stage):
        """The plugin's `zero_stage` must equal the number left after removing `"zero"` from `stage`."""
        config_path = self.ds_config_file[stage]
        plugin = DeepSpeedPlugin(hf_ds_config=config_path, zero3_init_flag=True)
        stage_digits = stage.replace("zero", "")
        assert plugin.zero_stage == int(stage_digits)

    @require_fp16
    def test_prepare_deepspeed_prepare_moe(self):
        """Every `Qwen2MoeSparseMoeBlock` must be flagged as a ZeRO-3 leaf module after `prepare`.

        The plugin is created with `transformer_moe_cls_names="Qwen2MoeSparseMoeBlock"` and ZeRO stage 3;
        after preparing the model each module of that class is expected to carry a truthy `_z3_leaf`.

        The test is skipped (and reported as skipped) when the installed `transformers` or `deepspeed`
        is older than the versions named in the gate below.
        """
        # The test imports from `transformers.models.qwen2_moe` and relies on DeepSpeed marking leaf
        # modules, so it cannot run if *either* library is too old. Skip explicitly instead of
        # returning silently, which would be reported as a pass.
        if compare_versions("transformers", "<", "4.40") or compare_versions("deepspeed", "<", "0.14"):
            self.skipTest("This test requires transformers>=4.40 and deepspeed>=0.14.")

        deepspeed_plugin = DeepSpeedPlugin(
            zero3_init_flag=True,
            gradient_accumulation_steps=1,
            gradient_clipping=1.0,
            zero_stage=3,
            offload_optimizer_device="none",
            offload_param_device="none",
            zero3_save_16bit_model=True,
            transformer_moe_cls_names="Qwen2MoeSparseMoeBlock",
        )
        with patch_environment(**self.dist_env):
            accelerator = Accelerator(mixed_precision="fp16", deepspeed_plugin=deepspeed_plugin)
            # No dataloader is prepared here, so the micro batch size is set by hand.
            accelerator.state.deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = 1
            model = AutoModelForCausalLM.from_pretrained(QWEN_MOE)
            model = accelerator.prepare(model)
            # Imported lazily: the module only exists in sufficiently recent transformers releases.
            from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock

            for module in model.modules():
                if isinstance(module, Qwen2MoeSparseMoeBlock):
                    assert hasattr(module, "_z3_leaf") and module._z3_leaf

    @run_first
    @require_fp16
    def test_basic_run(self):
        """Launch `test_performance.py` once through `accelerate launch` with DeepSpeed ZeRO stage 2.

        The script runs in a single process with fp16 mixed precision and writes its output into a
        temporary directory that is removed afterwards.
        """
        script = path_in_accelerate_package("test_utils", "scripts", "external_deps", "test_performance.py")
        launcher_args = [
            "accelerate",
            "launch",
            "--num_processes=1",
            "--num_machines=1",
            "--machine_rank=0",
            "--mixed_precision=fp16",
            "--use_deepspeed",
            "--gradient_accumulation_steps=1",
            "--zero_stage=2",
            "--offload_optimizer_device=none",
            "--offload_param_device=none",
        ]
        with tempfile.TemporaryDirectory() as output_dir:
            script_args = [
                script,
                "--model_name_or_path=distilbert-base-uncased",
                "--num_epochs=1",
                f"--output_dir={output_dir}",
            ]
            command = [*launcher_args, *script_args]
            with patch_environment(omp_num_threads=1):
                execute_subprocess_async(command)


@slow
@run_first
@require_deepspeed
@require_multi_device
class DeepSpeedIntegrationTest(TempDirTestCase):
    """Integration tests that run the `external_deps` scripts through `accelerate launch` with DeepSpeed.

    Every test assembles an `accelerate launch` command line for two processes and executes it in a
    subprocess with `omp_num_threads=1` patched into the environment. The launched script receives
    `self.tmpdir` as its output directory.
    """

    test_scripts_folder = path_in_accelerate_package("test_utils", "scripts", "external_deps")

    def setUp(self):
        """Resolve the DeepSpeed config files next to this test module and set the shared parameters."""
        super().setUp()
        self._test_file_path = inspect.getfile(self.__class__)
        path = Path(self._test_file_path).resolve()
        self.test_file_dir_str = str(path.parents[0])

        self.ds_config_file = dict(
            zero2=f"{self.test_file_dir_str}/ds_config_zero2.json",
            zero3=f"{self.test_file_dir_str}/ds_config_zero3.json",
        )

        self.stages = [1, 2, 3]
        # When True, the stage commands additionally pass a trimmed ZeRO-3 config file.
        self.zero3_offload_config = False
        self.performance_lower_bound = 0.82
        # Maps a run specification to the value passed as `--peak_memory_upper_bound`.
        self.peak_memory_usage_upper_bound = {
            "multi_gpu_fp16": 3200,
            "deepspeed_stage_1_fp16": 1600,
            "deepspeed_stage_2_fp16": 2500,
            "deepspeed_stage_3_zero_init_fp16": 2800,
            # Disabling below test as it overwhelms the RAM memory usage
            # on CI self-hosted runner leading to tests getting killed.
            # "deepspeed_stage_3_cpu_offload_fp16": 1900,
        }
        self.n_train = 160
        self.n_val = 160

    def _write_trimmed_zero3_config(self, enable_fp16):
        """Write a trimmed copy of the ZeRO-3 config into `self.tmpdir` and return the path of the copy.

        The `bf16` section and the `torch_adam` / `adam_w_mode` optimizer parameters are always removed.

        Args:
            enable_fp16: If true, keep the `fp16` section and set its `enabled` flag to `True`;
                otherwise remove the `fp16` section as well.
        """
        with open(self.ds_config_file[ZERO3], encoding="utf-8") as f:
            ds_config = json.load(f)

        del ds_config["bf16"]
        if enable_fp16:
            ds_config["fp16"]["enabled"] = True
        else:
            del ds_config["fp16"]
        del ds_config["optimizer"]["params"]["torch_adam"]
        del ds_config["optimizer"]["params"]["adam_w_mode"]

        ds_config_path = os.path.join(self.tmpdir, "ds_config.json")
        # Written with the same explicit encoding that is used for reading the source config.
        with open(ds_config_path, "w", encoding="utf-8") as out_file:
            json.dump(ds_config, out_file)
        return ds_config_path

    def _build_stage_command(self, stage):
        """Return the fp16 `accelerate launch` arguments for ZeRO `stage`, up to but excluding the script."""
        cmd_stage = [
            "accelerate",
            "launch",
            "--num_processes=2",
            "--num_machines=1",
            "--machine_rank=0",
            "--mixed_precision=fp16",
            "--use_deepspeed",
            "--gradient_accumulation_steps=1",
            "--gradient_clipping=1",
            "--zero3_init_flag=True",
            "--zero3_save_16bit_model=True",
            f"--zero_stage={stage}",
            "--offload_optimizer_device=none",
            "--offload_param_device=none",
        ]
        if self.zero3_offload_config:
            ds_config_path = self._write_trimmed_zero3_config(enable_fp16=True)
            cmd_stage.append(f"--deepspeed_config_file={ds_config_path}")
        return cmd_stage

    @require_fp16
    def test_performance(self):
        """Run `test_performance.py` for every stage except 1, passing the performance lower bound."""
        self.test_file_path = self.test_scripts_folder / "test_performance.py"
        for stage in self.stages:
            # Stage 1 is not exercised by this test.
            if stage == 1:
                continue
            cmd_stage = self._build_stage_command(stage)
            cmd_stage.extend(
                [
                    self.test_file_path,
                    f"--output_dir={self.tmpdir}",
                    f"--performance_lower_bound={self.performance_lower_bound}",
                ]
            )
            with patch_environment(omp_num_threads=1):
                execute_subprocess_async(cmd_stage)

    @require_fp16
    def test_checkpointing(self):
        """Run `test_checkpointing.py` twice per stage: a partial run, then a run resuming from `epoch_0`."""
        self.test_file_path = self.test_scripts_folder / "test_checkpointing.py"
        for stage in self.stages:
            # Stage 1 is not exercised by this test.
            if stage == 1:
                continue
            cmd_stage = self._build_stage_command(stage)
            cmd_stage.extend(
                [
                    self.test_file_path,
                    f"--output_dir={self.tmpdir}",
                    "--partial_train_epoch=1",
                ]
            )
            with patch_environment(omp_num_threads=1):
                execute_subprocess_async(cmd_stage)

            # Second run: drop `--partial_train_epoch=1` (the last argument) and resume instead.
            cmd_stage = cmd_stage[:-1]
            resume_from_checkpoint = os.path.join(self.tmpdir, "epoch_0")
            cmd_stage.extend(
                [
                    f"--resume_from_checkpoint={resume_from_checkpoint}",
                ]
            )
            with patch_environment(omp_num_threads=1):
                execute_subprocess_async(cmd_stage)

    @require_fp16
    def test_peak_memory_usage(self):
        """Run `test_peak_memory_usage.py` for each DeepSpeed entry of `peak_memory_usage_upper_bound`."""
        if compare_versions("deepspeed", ">", "0.12.6"):
            self.skipTest(
                "The test fails when deepspeed>0.12.6. This is something that needs to be fixed on deepspeed library"
            )

        self.test_file_path = self.test_scripts_folder / "test_peak_memory_usage.py"
        cmd = [
            "accelerate",
            "launch",
            "--num_processes=2",
            "--num_machines=1",
            "--machine_rank=0",
        ]
        for spec, peak_mem_upper_bound in self.peak_memory_usage_upper_bound.items():
            # Entries describing a plain multi-GPU run are not launched by this test.
            if "multi_gpu" in spec:
                continue

            cmd_stage = cmd.copy()
            if "fp16" in spec:
                cmd_stage.extend(["--mixed_precision=fp16"])

            cmd_stage.extend(
                [
                    "--use_deepspeed",
                    "--gradient_accumulation_steps=1",
                    "--gradient_clipping=1",
                    "--zero3_init_flag=True",
                    "--zero3_save_16bit_model=True",
                ]
            )
            # The ZeRO stage is encoded in the spec name as `stage_<n>`; use the first one that matches.
            for stage in (1, 2, 3):
                if f"stage_{stage}" in spec:
                    cmd_stage.extend([f"--zero_stage={stage}"])
                    break
            cmd_stage.extend(
                [
                    "--offload_optimizer_device=none",
                    "--offload_param_device=none",
                    "--offload_optimizer_nvme_path=none",
                    "--offload_param_nvme_path=none",
                ]
            )
            if "cpu_offload" in spec:
                ds_config_path = self._write_trimmed_zero3_config(enable_fp16=False)
                cmd_stage.extend([f"--deepspeed_config_file={ds_config_path}"])

            cmd_stage.extend(
                [
                    self.test_file_path,
                    f"--output_dir={self.tmpdir}",
                    f"--peak_memory_upper_bound={peak_mem_upper_bound}",
                    f"--n_train={self.n_train}",
                    f"--n_val={self.n_val}",
                ]
            )
            with patch_environment(omp_num_threads=1):
                execute_subprocess_async(cmd_stage)

    def test_lr_scheduler(self):
        """Run `test_performance.py` with ZeRO stage 3 and mixed precision disabled."""
        self.test_file_path = self.test_scripts_folder / "test_performance.py"
        cmd = [
            "accelerate",
            "launch",
            "--num_processes=2",
            "--num_machines=1",
            "--machine_rank=0",
            "--mixed_precision=no",
            "--use_deepspeed",
            "--gradient_accumulation_steps=1",
            "--gradient_clipping=1",
            "--zero3_init_flag=True",
            "--zero3_save_16bit_model=True",
            "--zero_stage=3",
            "--offload_optimizer_device=none",
            "--offload_param_device=none",
            self.test_file_path,
            f"--output_dir={self.tmpdir}",
            f"--performance_lower_bound={self.performance_lower_bound}",
        ]
        with patch_environment(omp_num_threads=1):
            execute_subprocess_async(cmd)

    @require_huggingface_suite
    def test_zero3_integration(self):
        """Run `test_zero3_integration.py` on two processes without any extra launch arguments."""
        self.test_file_path = self.test_scripts_folder / "test_zero3_integration.py"
        cmd = ["accelerate", "launch", "--num_processes=2", "--num_machines=1", self.test_file_path]
        with patch_environment(omp_num_threads=1):
            execute_subprocess_async(cmd)


================================================
FILE: tests/deepspeed/test_deepspeed_gradient_accumulation.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import json
from pathlib import Path

import torch
from torch.utils.data import DataLoader
from transformers import AutoModel
from transformers.trainer_utils import set_seed

from accelerate.accelerator import Accelerator
from accelerate.test_utils.testing import AccelerateTestCase, require_deepspeed
from accelerate.test_utils.training import RegressionDataset
from accelerate.utils import patch_environment
from accelerate.utils.dataclasses import DeepSpeedPlugin


# Fix the seed once, at import time of this test module.
set_seed(42)

# Hub identifier of the tiny GPT-2 checkpoint loaded by the tests in this module.
GPT2_TINY = "hf-internal-testing/tiny-random-gpt2"
# Keys of the `ds_config_file` mapping that `setUp` builds for the ZeRO stage 2 / stage 3 config files.
ZERO2 = "zero2"
ZERO3 = "zero3"
# Name of the fp16 mixed-precision mode.
FP16 = "fp16"


@require_deepspeed
class DeepSpeedGradientAccumulationTest(AccelerateTestCase):
    """Single-process tests of gradient accumulation and gradient clipping with a `DeepSpeedPlugin`.

    Every test creates an `Accelerator` inside a patched environment that describes a
    one-process distributed run and feeds random token ids to a tiny GPT-2 model.
    """

    def setUp(self):
        """Load the ZeRO stage 2/3 JSON configs next to this file and define the distributed environment."""
        super().setUp()

        self._test_file_path = inspect.getfile(self.__class__)
        self.test_file_dir_str = str(Path(self._test_file_path).resolve().parents[0])

        self.ds_config_file = {
            "zero2": f"{self.test_file_dir_str}/ds_config_zero2.json",
            "zero3": f"{self.test_file_dir_str}/ds_config_zero3.json",
        }

        # Parse both config files; the stage 3 config gets 16-bit weight gathering on save switched off.
        with open(self.ds_config_file[ZERO2], encoding="utf-8") as zero2_handle:
            zero2_settings = json.load(zero2_handle)
        with open(self.ds_config_file[ZERO3], encoding="utf-8") as zero3_handle:
            zero3_settings = json.load(zero3_handle)
        zero3_settings["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = False

        self.ds_config_dict = {"zero2": zero2_settings, "zero3": zero3_settings}

        # Environment variables describing a single-process (rank 0 of 1) run with DeepSpeed enabled.
        self.dist_env = {
            "ACCELERATE_USE_DEEPSPEED": "true",
            "MASTER_ADDR": "localhost",
            "MASTER_PORT": "10999",
            "RANK": "0",
            "LOCAL_RANK": "0",
            "WORLD_SIZE": "1",
        }

    @staticmethod
    def _zero2_cpu_offload_plugin(gradient_accumulation_steps):
        """Return the ZeRO stage 2 plugin (CPU offload, gradient clipping at 1.0) that every test uses."""
        return DeepSpeedPlugin(
            gradient_accumulation_steps=gradient_accumulation_steps,
            gradient_clipping=1.0,
            zero_stage=2,
            offload_optimizer_device="cpu",
            offload_param_device="cpu",
            zero3_save_16bit_model=False,
            zero3_init_flag=False,
        )

    def test_gradient_accumulation_boundary_integration(self):
        """Test that gradient accumulation boundaries are automatically handled by DeepSpeed integration."""
        gradient_accumulation_steps = 4
        deepspeed_plugin = self._zero2_cpu_offload_plugin(gradient_accumulation_steps)

        with patch_environment(**self.dist_env):
            accelerator = Accelerator(mixed_precision="fp16", deepspeed_plugin=deepspeed_plugin)

            # Minimal training components: the regression data only supplies a batch size,
            # the actual model inputs are generated below.
            train_dataloader = DataLoader(RegressionDataset(length=80), batch_size=16, shuffle=True)
            model = AutoModel.from_pretrained(GPT2_TINY)
            optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
            model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)
            model.train()

            x_values = next(iter(train_dataloader))["x"]
            batch_size = x_values.shape[0] if isinstance(x_values, torch.Tensor) else 1

            # Random token ids of shape (batch_size, sequence_length) on the model's device.
            device = next(model.parameters()).device
            inputs = {"input_ids": torch.randint(0, 1000, (batch_size, 10), device=device)}

            # `sync_gradients` as observed at the start of every micro step.
            observed_sync = []
            for _ in range(gradient_accumulation_steps):
                with accelerator.accumulate(model):
                    observed_sync.append(accelerator.sync_gradients)
                    hidden_mean = model(**inputs).last_hidden_state.mean()
                    loss = hidden_mean.sum()

                    accelerator.backward(loss)

                    # Only step the optimizer on the micro step that synchronizes gradients.
                    if accelerator.sync_gradients:
                        optimizer.step()
                        optimizer.zero_grad()

            # Three accumulating micro steps followed by one synchronizing step.
            self.assertEqual(observed_sync, [False, False, False, True])

            accelerator.step = 0

    def test_clip_grad_norm_returns_deepspeed_grad_norm(self):
        """Test that clip_grad_norm_ works with DeepSpeed and returns gradient norm when available."""
        deepspeed_plugin = self._zero2_cpu_offload_plugin(1)

        with patch_environment(**self.dist_env):
            accelerator = Accelerator(mixed_precision="fp16", deepspeed_plugin=deepspeed_plugin)

            model = AutoModel.from_pretrained(GPT2_TINY)
            optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
            # A dataloader is prepared alongside the model; its first batch only fixes the batch size.
            train_dataloader = DataLoader(RegressionDataset(length=16), batch_size=16, shuffle=True)
            model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

            x_values = next(iter(train_dataloader))["x"]
            batch_size = len(x_values) if isinstance(x_values, torch.Tensor) else 1

            device = next(model.parameters()).device
            input_ids = torch.randint(0, 1000, (batch_size, 10), device=device)

            # One forward/backward pass so that gradients exist before clipping is requested.
            loss = model(input_ids=input_ids).last_hidden_state.mean().sum()
            accelerator.backward(loss)

            grad_norm = accelerator.clip_grad_norm_(model.parameters(), max_norm=1.0)
            # `None` is an accepted result; any number that comes back must be non-negative.
            self.assertIsInstance(grad_norm, (int, float, type(None)))
            if grad_norm is not None:
                self.assertGreaterEqual(grad_norm, 0.0)

    def test_accelerator_backward_passes_sync_gradients(self):
        """Test that Accelerator.backward() passes sync_gradients to DeepSpeed wrapper."""
        deepspeed_plugin = self._zero2_cpu_offload_plugin(2)

        with patch_environment(**self.dist_env):
            accelerator = Accelerator(mixed_precision="fp16", deepspeed_plugin=deepspeed_plugin)

            model = AutoModel.from_pretrained(GPT2_TINY)
            optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
            train_dataloader = DataLoader(RegressionDataset(length=16), batch_size=8, shuffle=True)
            model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

            x_values = next(iter(train_dataloader))["x"]
            batch_size = len(x_values) if isinstance(x_values, torch.Tensor) else 1

            device = next(model.parameters()).device
            inputs = {"input_ids": torch.randint(0, 1000, (batch_size, 10), device=device)}

            # Two micro steps on the same inputs, recording `sync_gradients` at the start of each.
            sync_values = []
            for _ in range(2):
                with accelerator.accumulate(model):
                    sync_values.append(accelerator.sync_gradients)
                    loss = model(**inputs).last_hidden_state.mean()
                    accelerator.backward(loss)

            # With two accumulation steps, only the second micro step synchronizes.
            self.assertEqual(len(sync_values), 2)
            first_step_sync, second_step_sync = sync_values
            self.assertFalse(first_step_sync)
            self.assertTrue(second_step_sync)


================================================
FILE: tests/deepspeed/test_deepspeed_multiple_model.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import json
from functools import partial
from pathlib import Path

import torch
from transformers import AutoModelForCausalLM

from accelerate import Accelerator, DeepSpeedPlugin
from accelerate.commands.launch import launch_command, launch_command_parser
from accelerate.test_utils.testing import (
    AccelerateTestCase,
    path_in_accelerate_package,
    require_deepspeed,
    require_huggingface_suite,
    require_multi_device,
    require_non_cpu,
    run_first,
    slow,
)
from accelerate.test_utils.training import RegressionDataset
from accelerate.utils import patch_environment
from accelerate.utils.deepspeed import DummyOptim, DummyScheduler, get_active_deepspeed_plugin


# Hub identifier of the tiny GPT-2 checkpoint that `setUp` binds into `self.model_init`.
GPT2_TINY = "hf-internal-testing/tiny-random-gpt2"


@require_deepspeed
@require_non_cpu
class DeepSpeedConfigIntegration(AccelerateTestCase):
    """Tests for passing several named `DeepSpeedPlugin`s to one `Accelerator` and switching between them."""

    parser = launch_command_parser()
    test_scripts_folder = path_in_accelerate_package("test_utils", "scripts", "external_deps")

    def setUp(self):
        """Define the one-process distributed environment and load the three DeepSpeed JSON configs."""
        super().setUp()

        self.dist_env = {
            "ACCELERATE_USE_DEEPSPEED": "true",
            "MASTER_ADDR": "localhost",
            "MASTER_PORT": "10999",
            "RANK": "0",
            "LOCAL_RANK": "0",
            "WORLD_SIZE": "1",
        }

        self._test_file_path = inspect.getfile(self.__class__)
        self.test_file_dir_str = str(Path(self._test_file_path).resolve().parents[0])

        self.ds_config_file = {
            "zero2": f"{self.test_file_dir_str}/ds_config_zero2.json",
            "zero3_inference": f"{self.test_file_dir_str}/ds_config_zero3_model_only.json",
            "zero3_training": f"{self.test_file_dir_str}/ds_config_zero3.json",
        }

        def read_config(key):
            # Parse the JSON config file registered under `key`.
            with open(self.ds_config_file[key], encoding="utf-8") as handle:
                return json.load(handle)

        self.config_zero2 = read_config("zero2")
        self.config_zero3 = read_config("zero3_training")
        self.config_zero3_inference = read_config("zero3_inference")

        self.model_init = partial(AutoModelForCausalLM.from_pretrained, GPT2_TINY)

    def get_ds_plugins(self, zero3_inference=False):
        """Return a `{"zero2": ..., "zero3": ...}` dict of freshly built plugins.

        Args:
            zero3_inference: when true, the "zero3" plugin is built from the model-only
                stage 3 config instead of the training one.
        """
        zero3_config = self.config_zero3_inference if zero3_inference else self.config_zero3
        return {
            "zero2": DeepSpeedPlugin(hf_ds_config=self.config_zero2),
            "zero3": DeepSpeedPlugin(hf_ds_config=zero3_config),
        }

    def test_select_plugin(self):
        """Selecting a plugin by name flips `selected` and updates every accessor of the active plugin."""
        ds_plugins = self.get_ds_plugins()
        ds_zero2, ds_zero3 = ds_plugins["zero2"], ds_plugins["zero3"]
        accelerator = Accelerator(deepspeed_plugin=ds_plugins)

        # Accelerator's constructor should automatically enable the first plugin
        assert ds_zero2.selected
        assert not ds_zero3.selected
        assert get_active_deepspeed_plugin(accelerator.state) == ds_zero2
        assert accelerator.deepspeed_plugin == ds_zero2
        assert accelerator.state.get_deepspeed_plugin("zero2") == ds_zero2

        # Switch to zero3 and back to zero2, checking the whole state after each switch.
        for target, active, inactive in (("zero3", ds_zero3, ds_zero2), ("zero2", ds_zero2, ds_zero3)):
            accelerator.state.select_deepspeed_plugin(target)
            assert not inactive.selected
            assert active.selected
            assert get_active_deepspeed_plugin(accelerator.state) == active
            assert accelerator.deepspeed_plugin == active
            assert accelerator.state.get_deepspeed_plugin(target) == active

    @require_huggingface_suite
    def test_config_reference_update(self):
        """The config exposed by transformers' `deepspeed_config()` follows the selected plugin."""
        # Make sure that the transformers weakref is updating when we update the config
        ds_plugins = self.get_ds_plugins(zero3_inference=True)
        zero2, zero3 = ds_plugins["zero2"], ds_plugins["zero3"]
        accelerator = Accelerator(deepspeed_plugin=ds_plugins)
        from transformers.integrations.deepspeed import deepspeed_config

        # Note that these have `auto` values being set so we need to adjust
        assert accelerator.deepspeed_plugin is zero2
        zero2.deepspeed_config["train_micro_batch_size_per_gpu"] = 1
        zero2.deepspeed_config.pop("train_batch_size")
        assert deepspeed_config() == accelerator.deepspeed_plugin.hf_ds_config.config

        accelerator.state.select_deepspeed_plugin("zero3")
        assert accelerator.deepspeed_plugin is zero3
        assert deepspeed_config() == accelerator.deepspeed_plugin.hf_ds_config.config

    def test_enable_disable_manually_set(self):
        """`select()` before an `Accelerator` exists and direct assignment to `selected` are both rejected."""
        ds_plugins = self.get_ds_plugins()
        ds_zero2 = ds_plugins["zero2"]

        with self.assertRaises(ValueError):
            ds_zero2.select()

        accelerator = Accelerator(deepspeed_plugin=ds_plugins)
        accelerator.state.select_deepspeed_plugin("zero2")
        with self.assertRaises(NotImplementedError):
            ds_zero2.selected = False
        # The rejected assignment must leave the plugin selected.
        assert ds_zero2.selected

    def test_multiple_accelerators(self):
        """Creating a second `Accelerator` with a different DeepSpeed plugin raises `NotImplementedError`."""
        ds_plugins = self.get_ds_plugins()
        ds_zero2, ds_zero3 = ds_plugins["zero2"], ds_plugins["zero3"]
        _ = Accelerator(deepspeed_plugin=ds_zero2)
        with self.assertRaises(NotImplementedError):
            _ = Accelerator(deepspeed_plugin=ds_zero3)

    def test_prepare_multiple_models_zero3_inference(self):
        """Preparing a second model under the zero3 plugin warns and keeps the first engine attached."""
        with patch_environment(**self.dist_env):
            accelerator = Accelerator(deepspeed_plugin=self.get_ds_plugins(zero3_inference=True))

            # Using Zero-2 first
            model1 = self.model_init()
            optimizer = DummyOptim(model1.parameters())
            scheduler = DummyScheduler(optimizer)
            dataloader = torch.utils.data.DataLoader(RegressionDataset(), batch_size=1)
            model1, optimizer, scheduler, dataloader = accelerator.prepare(model1, optimizer, scheduler, dataloader)

            # Then prepare a second model after switching to the zero3 plugin.
            accelerator.state.select_deepspeed_plugin("zero3")
            model2 = self.model_init()
            with self.assertLogs(level="WARNING") as captured:
                model2 = accelerator.prepare(model2)
                self.assertIn(
                    "A wrapped DeepSpeed engine reference is currently tied for this `Accelerator()` instance.",
                    captured.output[0],
                )

            assert accelerator.deepspeed_engine_wrapped.engine is model1

    @run_first
    @require_huggingface_suite
    @require_multi_device
    @slow
    def test_train_multiple_models(self):
        """Run `test_ds_multiple_model.py` on two processes through the launch command."""
        self.test_file_path = self.test_scripts_folder / "test_ds_multiple_model.py"
        launch_argv = ["--num_processes=2", "--num_machines=1", str(self.test_file_path)]
        launch_command(self.parser.parse_args(launch_argv))


================================================
FILE: tests/fsdp/test_fsdp.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import functools
import os
from contextlib import nullcontext

import torch
from transformers import AutoModel

from accelerate.accelerator import Accelerator
from accelerate.state import AcceleratorState, DistributedType
from accelerate.test_utils.testing import (
    AccelerateTestCase,
    TempDirTestCase,
    execute_subprocess_async,
    get_launch_command,
    path_in_accelerate_package,
    require_fp16,
    require_fsdp2,
    require_multi_device,
    require_non_cpu,
    require_non_torch_xla,
    run_first,
    slow,
)
from accelerate.utils import is_bf16_available, is_fp16_available, is_hpu_available, patch_environment, set_seed
from accelerate.utils.constants import (
    FSDP2_STATE_DICT_TYPE,
    FSDP_AUTO_WRAP_POLICY,
    FSDP_BACKWARD_PREFETCH,
    FSDP_SHARDING_STRATEGY,
    FSDP_STATE_DICT_TYPE,
)
from accelerate.utils.dataclasses import FullyShardedDataParallelPlugin
from accelerate.utils.fsdp_utils import disable_fsdp_ram_efficient_loading, enable_fsdp_ram_efficient_loading


# Fix the seed once, at import time of this test module.
set_seed(42)


# Model identifiers and mixed-precision mode names used by the tests in this module.
BERT_BASE_CASED = "bert-base-cased"
LLAMA_TESTING = "hf-internal-testing/tiny-random-LlamaForCausalLM"
FP16 = "fp16"
BF16 = "bf16"

# Mixed-precision modes whose availability check passes, fp16 first.
dtypes = [
    precision
    for precision, is_available in ((FP16, is_fp16_available), (BF16, is_bf16_available))
    if is_available()
]


@require_non_cpu
@require_non_torch_xla
class FSDPPluginIntegration(AccelerateTestCase):
    def setUp(self):
        """Build the per-FSDP-version environments; the base class tests FSDP version 1."""
        super().setUp()

        # Single-process (rank 0 of 1) distributed environment.
        self.dist_env = {
            "MASTER_ADDR": "localhost",
            "MASTER_PORT": "10999",
            "RANK": "0",
            "LOCAL_RANK": "0",
            "WORLD_SIZE": "1",
        }

        fsdp_switch = {"ACCELERATE_USE_FSDP": "true"}
        self.fsdp1_env = {**fsdp_switch, **self.dist_env}
        self.fsdp2_env = {**fsdp_switch, **self.dist_env, "FSDP_VERSION": "2"}

        # Environment to patch in, keyed by FSDP version.
        self.fsdp_envs = {1: self.fsdp1_env, 2: self.fsdp2_env}

        self.current_fsdp_version = 1

    def test_sharding_strategy(self):
        """Check that the sharding configuration is picked up from the environment and from arguments.

        FSDP1 is configured through `sharding_strategy`; FSDP2 through the boolean
        `reshard_after_forward`. Whichever one does not apply must stay `None`.
        """
        from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy

        strategies_by_version = {
            1: FSDP_SHARDING_STRATEGY,
            2: [True, False],
        }
        env_key_by_version = {
            1: "FSDP_SHARDING_STRATEGY",
            2: "FSDP_RESHARD_AFTER_FORWARD",
        }

        # check that giving enums works fine
        # Only supported in FSDP1
        for enum_value, _ in enumerate(FSDP_SHARDING_STRATEGY, start=1):
            env = {**self.fsdp_envs[1], "FSDP_SHARDING_STRATEGY": f"{enum_value}"}
            with patch_environment(**env):
                plugin_from_env = FullyShardedDataParallelPlugin()
                assert plugin_from_env.sharding_strategy == ShardingStrategy(enum_value)
            plugin_from_arg = FullyShardedDataParallelPlugin(sharding_strategy=ShardingStrategy(enum_value))
            assert plugin_from_arg.sharding_strategy == ShardingStrategy(enum_value)

        # check that giving names works fine, also needed for FSDP2
        fsdp_version = self.current_fsdp_version

        def assert_plugin_matches(plugin, enum_value, strategy):
            # Only the field that belongs to the active FSDP version may be populated.
            if fsdp_version == 1:
                assert plugin.sharding_strategy == ShardingStrategy(enum_value)
                assert plugin.reshard_after_forward is None
            else:
                assert plugin.reshard_after_forward == strategy
                assert plugin.sharding_strategy is None

        for enum_value, strategy in enumerate(strategies_by_version[fsdp_version], start=1):
            # Configuration through the environment.
            env = self.fsdp_envs[fsdp_version].copy()
            env[env_key_by_version[fsdp_version]] = strategy
            with patch_environment(**env):
                fsdp_plugin = FullyShardedDataParallelPlugin()
            assert_plugin_matches(fsdp_plugin, enum_value, strategy)

            # Configuration through constructor arguments.
            with patch_environment(**self.fsdp_envs[fsdp_version]):
                if fsdp_version == 1:
                    plugin_kwargs = {"sharding_strategy": ShardingStrategy(enum_value)}
                else:
                    plugin_kwargs = {"reshard_after_forward": strategy}
                fsdp_plugin = FullyShardedDataParallelPlugin(**plugin_kwargs)
                assert_plugin_matches(fsdp_plugin, enum_value, strategy)

    def test_backward_prefetch(self):
        """Check `backward_prefetch` given via environment, torch enum, and policy name.

        Under FSDP2 the expected value is always `None`, and a warning is expected
        whenever a policy other than "NO_PREFETCH" is requested.
        """
        from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch

        fsdp2_warning = "backward_prefetch is not supported in FSDP2. Setting backward prefetch to None."
        fsdp_version = self.current_fsdp_version

        def assert_warned(watcher):
            # `watcher` is None when the context is a `nullcontext`, i.e. no warning is expected.
            if watcher:
                self.assertTrue(any(fsdp2_warning in out for out in watcher.output))

        for enum_value, prefetch_policy in enumerate(FSDP_BACKWARD_PREFETCH, start=1):
            prefetch_requested = prefetch_policy != "NO_PREFETCH"

            # FSDP2 warns about backward prefetch and sets to None
            if fsdp_version == 2 and prefetch_requested:
                log_ctx = self.assertLogs("accelerate.utils.dataclasses", level="WARNING")
            else:
                log_ctx = nullcontext()

            if fsdp_version == 2 or not prefetch_requested:
                expected_value = None
            else:
                expected_value = BackwardPrefetch(enum_value)

            # Policy name given through the environment.
            env = self.fsdp_envs[fsdp_version].copy()
            env["FSDP_BACKWARD_PREFETCH"] = prefetch_policy
            with patch_environment(**env), log_ctx as watcher:
                fsdp_plugin = FullyShardedDataParallelPlugin()
                assert fsdp_plugin.backward_prefetch == expected_value, (
                    f"Actual: {fsdp_plugin.backward_prefetch} != Expected: {expected_value}"
                )
                assert_warned(watcher)

            # Check if torch enum works
            env = self.fsdp_envs[fsdp_version].copy()
            with patch_environment(**env), log_ctx as watcher:
                if prefetch_requested:
                    fsdp_plugin = FullyShardedDataParallelPlugin(backward_prefetch=BackwardPrefetch(enum_value))
                    assert fsdp_plugin.backward_prefetch == expected_value
                    assert_warned(watcher)

            # Check if name works
            with patch_environment(**env), log_ctx as watcher:
                fsdp_plugin = FullyShardedDataParallelPlugin(backward_prefetch=prefetch_policy)
                assert fsdp_plugin.backward_prefetch == expected_value
                assert_warned(watcher)

    def test_state_dict_type(self):
        """Check `state_dict_type` given via environment and via torch enum, then overridden afterwards.

        Under FSDP2, a type that is not listed in `FSDP2_STATE_DICT_TYPE` is expected
        to raise `ValueError` instead.
        """
        from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType

        fsdp_version = self.current_fsdp_version

        def assert_state_dict_setup(plugin, expected_type, type_name):
            assert plugin.state_dict_type == expected_type
            # A full state dict must be configured to offload to CPU on rank 0 only.
            if type_name == "FULL_STATE_DICT":
                assert plugin.state_dict_config.offload_to_cpu
                assert plugin.state_dict_config.rank0_only

        for enum_value, state_dict_type in enumerate(FSDP_STATE_DICT_TYPE, start=1):
            unsupported = fsdp_version == 2 and state_dict_type not in FSDP2_STATE_DICT_TYPE
            outcome_ctx = self.assertRaises(ValueError) if unsupported else nullcontext()

            # Type name given through the environment.
            env = self.fsdp_envs[fsdp_version].copy()
            env["FSDP_STATE_DICT_TYPE"] = state_dict_type
            with patch_environment(**env), outcome_ctx:
                fsdp_plugin = FullyShardedDataParallelPlugin()
                assert_state_dict_setup(fsdp_plugin, StateDictType(enum_value), state_dict_type)

            # Torch enum given as a constructor argument.
            env = self.fsdp_envs[fsdp_version].copy()
            with patch_environment(**env), outcome_ctx:
                fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_type=StateDictType(enum_value))
                assert_state_dict_setup(fsdp_plugin, StateDictType(enum_value), state_dict_type)

        # We can also override the state_dict_type,
        # typical case: user trains with sharded, but final save is with full
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_type="FULL_STATE_DICT")
        fsdp_plugin.set_state_dict_type("SHARDED_STATE_DICT")
        assert fsdp_plugin.state_dict_type == StateDictType.SHARDED_STATE_DICT

    def test_auto_wrap_policy(self):
        """Check `set_auto_wrap_policy` for every wrap policy, set via environment and via arguments.

        Also covers two special cases: an unknown transformer layer class must raise,
        and a size-based policy with a minimum of 0 parameters yields no policy.
        """
        fsdp_version = self.current_fsdp_version

        def assert_policy_kind(plugin, policy_name):
            # "NO_WRAP" leaves the policy unset; every other policy resolves to a partial.
            if policy_name == "NO_WRAP":
                assert plugin.auto_wrap_policy is None
            else:
                assert isinstance(plugin.auto_wrap_policy, functools.partial)

        for model_name in [LLAMA_TESTING, BERT_BASE_CASED]:
            model = AutoModel.from_pretrained(model_name)
            layer_to_wrap = "LlamaDecoderLayer" if model_name == LLAMA_TESTING else "BertLayer"
            for policy in FSDP_AUTO_WRAP_POLICY:
                env = self.fsdp_envs[fsdp_version].copy()
                env["FSDP_AUTO_WRAP_POLICY"] = policy
                env.pop("FSDP_TRANSFORMER_CLS_TO_WRAP", None)
                env.pop("FSDP_MIN_NUM_PARAMS", None)

                # Policy-specific setting, mirrored in the environment and in the keyword arguments below.
                transformer_cls_to_wrap = None
                min_num_params = None
                if policy == "TRANSFORMER_BASED_WRAP":
                    transformer_cls_to_wrap = layer_to_wrap
                    env["FSDP_TRANSFORMER_CLS_TO_WRAP"] = layer_to_wrap
                elif policy == "SIZE_BASED_WRAP":
                    min_num_params = 2000
                    env["FSDP_MIN_NUM_PARAMS"] = "2000"

                # First test via env
                with patch_environment(**env):
                    fsdp_plugin = FullyShardedDataParallelPlugin()
                    fsdp_plugin.set_auto_wrap_policy(model)
                assert_policy_kind(fsdp_plugin, policy)

                # Then manually set the policy
                with patch_environment(**self.fsdp_envs[fsdp_version]):
                    fsdp_plugin = FullyShardedDataParallelPlugin(
                        auto_wrap_policy=policy,
                        transformer_cls_names_to_wrap=transformer_cls_to_wrap,
                        min_num_params=min_num_params,
                    )
                    fsdp_plugin.set_auto_wrap_policy(model)
                    assert_policy_kind(fsdp_plugin, policy)

        # From here on `model` is the last model loaded by the loop above.
        missing_layer_message = "Could not find the transformer layer class T5Layer in the model."

        # Unknown layer class, configured through the environment.
        env = self.fsdp_envs[fsdp_version].copy()
        env["FSDP_AUTO_WRAP_POLICY"] = "TRANSFORMER_BASED_WRAP"
        env["FSDP_TRANSFORMER_CLS_TO_WRAP"] = "T5Layer"
        with patch_environment(**env):
            fsdp_plugin = FullyShardedDataParallelPlugin()
            with self.assertRaises(Exception) as raised:
                fsdp_plugin.set_auto_wrap_policy(model)
            assert missing_layer_message in str(raised.exception)

        # Unknown layer class, configured through arguments.
        with patch_environment(**self.fsdp_envs[fsdp_version]):
            fsdp_plugin = FullyShardedDataParallelPlugin(
                auto_wrap_policy="TRANSFORMER_BASED_WRAP",
                transformer_cls_names_to_wrap="T5Layer",
            )
        with self.assertRaises(Exception) as raised:
            fsdp_plugin.set_auto_wrap_policy(model)
        assert missing_layer_message in str(raised.exception)

        # Size-based wrapping with a minimum of 0 parameters, configured through the environment.
        env = self.fsdp_envs[fsdp_version].copy()
        env["FSDP_AUTO_WRAP_POLICY"] = "SIZE_BASED_WRAP"
        env["FSDP_MIN_NUM_PARAMS"] = "0"
        with patch_environment(**env):
            fsdp_plugin = FullyShardedDataParallelPlugin()
            fsdp_plugin.set_auto_wrap_policy(model)
            assert fsdp_plugin.auto_wrap_policy is None

        # Same, configured through arguments.
        with patch_environment(**self.fsdp_envs[fsdp_version]):
            fsdp_plugin = FullyShardedDataParallelPlugin(
                auto_wrap_policy="SIZE_BASED_WRAP",
                min_num_params=0,
            )
        fsdp_plugin.set_auto_wrap_policy(model)
        assert fsdp_plugin.auto_wrap_policy is None

    def test_mixed_precision(self):
        """Check the mixed-precision policy and gradient scaler for every available low-precision mode.

        The policy is checked when the mode comes from the environment, from a mode
        name, from a dict of dtypes, and when the plugin is handed to an `Accelerator`.
        """
        fsdp_version = self.current_fsdp_version
        if fsdp_version == 2:
            from torch.amp.grad_scaler import GradScaler as Scaler
            from torch.distributed.fsdp import MixedPrecisionPolicy as MP
        else:
            from torch.distributed.fsdp import MixedPrecision as MP
            from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler as Scaler

        # Name of the third dtype field, which differs between the two policy classes.
        extra_arg = "buffer_dtype" if fsdp_version == 1 else "output_dtype"

        for mp_dtype in dtypes:
            if mp_dtype == "fp16":
                dtype = torch.float16
            elif mp_dtype == "bf16":
                dtype = torch.bfloat16
            policy_fields = {"param_dtype": dtype, "reduce_dtype": dtype, extra_arg: dtype}

            # Mode given through the environment, plugin created implicitly by the Accelerator.
            env = self.fsdp_envs[fsdp_version].copy()
            env["ACCELERATE_MIXED_PRECISION"] = mp_dtype
            with patch_environment(**env):
                accelerator = Accelerator()
                mp_policy = MP(**policy_fields)
                assert accelerator.state.fsdp_plugin.mixed_precision_policy == mp_policy
                # Only fp16 gets a gradient scaler.
                if mp_dtype == FP16:
                    assert isinstance(accelerator.scaler, Scaler)
                elif mp_dtype == BF16:
                    assert accelerator.scaler is None
                AcceleratorState._reset_state(True)

            # Policy given to the plugin directly: by mode name, then as a dict of dtypes.
            env = self.fsdp_envs[fsdp_version].copy()
            with patch_environment(**env):
                plugin = FullyShardedDataParallelPlugin(mixed_precision_policy=mp_dtype)
                assert plugin.mixed_precision_policy == mp_policy
            with patch_environment(**env):
                plugin = FullyShardedDataParallelPlugin(mixed_precision_policy=dict(policy_fields))
                assert plugin.mixed_precision_policy == mp_policy
            with patch_environment(**env):
                accelerator = Accelerator(fsdp_plugin=plugin)
                assert accelerator.state.fsdp_plugin.mixed_precision_policy == mp_policy
            AcceleratorState._reset_state(True)

    def test_mixed_precision_buffer_autocast_override(self):
        """Check that ``set_mixed_precision(..., buffer_autocast=True, override=True)`` keeps buffers in fp32.

        Only runs for FSDP1; the FSDP2 variant returns immediately.
        """
        from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
        from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler

        # We're not testing this for FSDP2 because FSDP2 doesn't support `buffer_dtype` rather only `output_dtype`
        # TODO(s1ro1): what should we do if `buffer_autocast` is set to True in FSDP2?
        if self.current_fsdp_version == 2:
            return

        torch_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16}

        for mp_dtype in dtypes:
            if mp_dtype in torch_dtypes:
                dtype = torch_dtypes[mp_dtype]
            # Parameters and reductions use the low-precision dtype, buffers stay in float32.
            expected_policy = MixedPrecision(param_dtype=dtype, reduce_dtype=dtype, buffer_dtype=torch.float32)

            fsdp1_env = self.fsdp_envs[1].copy()
            fsdp1_env["ACCELERATE_MIXED_PRECISION"] = mp_dtype
            with patch_environment(**fsdp1_env):
                accelerator = Accelerator()
                accelerator.state.fsdp_plugin.set_mixed_precision(dtype, buffer_autocast=True, override=True)
                assert accelerator.state.fsdp_plugin.mixed_precision_policy == expected_policy

                scaler = accelerator.scaler
                if mp_dtype == FP16:
                    assert isinstance(scaler, ShardedGradScaler)
                elif mp_dtype == BF16:
                    assert scaler is None
                AcceleratorState._reset_state(True)

    def test_cpu_offload(self):
        """Check the plugin's ``cpu_offload`` value when set via ``FSDP_OFFLOAD_PARAMS`` and via the constructor."""
        version = self.current_fsdp_version
        if version == 2:
            from torch.distributed.fsdp import CPUOffloadPolicy, OffloadPolicy
        else:
            from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload

        for flag in (True, False):
            # FSDP2 has a different class for not offloading, therefore we need to check for both cases
            if version != 2:
                expected_value = CPUOffload(offload_params=flag)
            elif flag:
                expected_value = CPUOffloadPolicy()
            else:
                expected_value = OffloadPolicy()

            # Offloading requested through the environment variable.
            env_with_offload = self.fsdp_envs[version].copy()
            env_with_offload["FSDP_OFFLOAD_PARAMS"] = str(flag).lower()
            with patch_environment(**env_with_offload):
                plugin = FullyShardedDataParallelPlugin()
                assert plugin.cpu_offload == expected_value

            # Offloading requested through the constructor argument.
            plain_env = self.fsdp_envs[version].copy()
            with patch_environment(**plain_env):
                plugin = FullyShardedDataParallelPlugin(cpu_offload=flag)
                assert plugin.cpu_offload == expected_value

    def test_cpu_ram_efficient_loading(self):
        """Check that the enable/disable helpers are reflected in the plugin and in ``FSDP_CPU_RAM_EFFICIENT_LOADING``."""
        version = self.current_fsdp_version
        scenarios = (
            (enable_fsdp_ram_efficient_loading, True, "True"),
            (disable_fsdp_ram_efficient_loading, False, "False"),
        )
        for toggle, expected_flag, expected_env_value in scenarios:
            toggle()
            env = self.fsdp_envs[version].copy()
            with patch_environment(**env):
                plugin = FullyShardedDataParallelPlugin()
                assert plugin.cpu_ram_efficient_loading is expected_flag
                assert os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING") == expected_env_value

    def test_ignored_modules_regex(self):
        """Check that a string ``FSDP_IGNORED_MODULES`` is treated as a regex over module names.

        The environment is taken from the FSDP version under test, so the subclass that
        switches ``current_fsdp_version`` runs with its own environment and hits the
        matching assertion branch below.
        """
        # Check that FSDP's ignored_modules can be a string, in which case it is treated as a regex
        fsdp_version = self.current_fsdp_version
        env = self.fsdp_envs[fsdp_version].copy()
        env["FSDP_IGNORED_MODULES"] = ".*\\.q_proj$"
        with patch_environment(**env):
            accelerator = Accelerator()
            model = AutoModel.from_pretrained(LLAMA_TESTING)
            model = accelerator.prepare(model)
            if fsdp_version == 1:
                # model has 2 layers
                layers_to_ignore = {model.layers[0].self_attn.q_proj, model.layers[1].self_attn.q_proj}
                assert model._ignored_modules == layers_to_ignore
            else:
                params_to_ignore = {model.layers[0].self_attn.q_proj.weight, model.layers[1].self_attn.q_proj.weight}
                assert model._ignored_params == params_to_ignore


@require_fsdp2
@require_non_cpu
@require_non_torch_xla
class FSDP2PluginIntegration(FSDPPluginIntegration):
    """Re-runs the inherited plugin tests with ``current_fsdp_version`` set to 2, plus FSDP2-only checks."""

    def setUp(self):
        super().setUp()
        self.current_fsdp_version = 2

    def test_param_mapping_error_handling(self):
        """Test FSDP2's defensive error handling for parameter mapping failures in tied/non-tied cases."""
        from unittest.mock import Mock, patch

        accelerator = Accelerator()
        accelerator.state.distributed_type = DistributedType.FSDP
        accelerator.state.fsdp_plugin = FullyShardedDataParallelPlugin(fsdp_version=2)

        model_stub = Mock(spec=torch.nn.Module)
        model_stub.config = Mock(tie_word_embeddings=True)
        optimizer_stub = Mock(spec=torch.optim.Optimizer)
        optimizer_stub.param_groups = []

        def prepare_failure_message(expected_exception, params_before, params_after):
            # `_get_named_parameters` is patched to return the "before" mapping on its first call
            # and the "after" mapping on its second; the message of the raised error is returned.
            with (
                patch.object(accelerator, "_get_named_parameters", side_effect=[params_before, params_after]),
                patch("accelerate.accelerator.fsdp2_canonicalize_names", side_effect=lambda x: x),
                patch("accelerate.accelerator.fsdp2_prepare_model", return_value=model_stub),
                patch.object(accelerator.state.fsdp_plugin, "set_auto_wrap_policy"),
                self.assertRaises(expected_exception) as caught,
            ):
                accelerator._prepare_fsdp2(model_stub, optimizer_stub)
            return str(caught.exception)

        # Tied case
        tied_message = prepare_failure_message(
            ValueError,
            {"model.embed_tokens.weight": 12345, "lm_head.weight": 67890, "other.weight": 11111},
            {"model.embed_tokens.weight": 12345, "other.weight": 11111},
        )
        for fragment in (
            "FSDP2 mapping failed",
            "tied embeddings",
            "lm_head.weight",
            "tie_word_embeddings = False",
        ):
            self.assertIn(fragment, tied_message)

        # Non-tied case
        untied_message = prepare_failure_message(
            KeyError,
            {"layer1.weight": 12345, "some_other.weight": 67890},
            {"layer1.weight": 12345},
        )
        for fragment in ("Parameters missing after FSDP2 wrapping", "some_other.weight"):
            self.assertIn(fragment, untied_message)

        AcceleratorState._reset_state(True)


@run_first
# Skip this test when TorchXLA is available because accelerate.launch does not support TorchXLA FSDP.
@require_non_torch_xla
@require_multi_device
@slow
class FSDPIntegrationTest(TempDirTestCase):
    """Runs the ``external_deps`` test scripts through the launch command with FSDP enabled.

    Each test assembles a command line from a spec string (for example
    ``fsdp_full_shard_transformer_based_wrap_fp16``) and executes it in a subprocess.
    ``current_fsdp_version`` selects which set of specs and flags is used.
    """

    test_scripts_folder = path_in_accelerate_package("test_utils", "scripts", "external_deps")

    def setUp(self):
        super().setUp()
        if is_hpu_available():
            self.performance_lower_bound = 0.70
        else:
            self.performance_lower_bound = 0.82

        self.fsdp1_performance_configs = [
            "fsdp_shard_grad_op_transformer_based_wrap",
            "fsdp_full_shard_transformer_based_wrap",
        ]
        # FSDP2 doesn't currently support other than full_shard/no_shard equivalents
        self.fsdp2_performance_configs = ["fsdp_full_shard_transformer_based_wrap"]
        self.performance_configs = {
            1: self.fsdp1_performance_configs,
            2: self.fsdp2_performance_configs,
        }

        self.fsdp1_peak_memory_usage_upper_bound = {
            "multi_gpu_fp16": 3200,
            "fsdp_shard_grad_op_transformer_based_wrap_fp16": 2000,
            "fsdp_full_shard_transformer_based_wrap_fp16": 1900,
            # Disabling below test as it overwhelms the RAM memory usage
            # on CI self-hosted runner leading to tests getting killed.
            # "fsdp_full_shard_cpu_offload_transformer_based_wrap_fp32": 1500,  # fp16 was leading to indefinite hang
        }
        self.fsdp2_peak_memory_usage_upper_bound = {
            "multi_gpu_fp16": 3200,
            "fsdp_full_shard_transformer_based_wrap_fp16": 1900,
        }
        self.peak_memory_usage_upper_bound = {
            1: self.fsdp1_peak_memory_usage_upper_bound,
            2: self.fsdp2_peak_memory_usage_upper_bound,
        }

        self.n_train = 160
        self.n_val = 160

        self.current_fsdp_version = 1

    def _append_sharding_args(self, cmd_config, spec):
        """Append the sharding flag for the first sharding strategy whose lowercase name occurs in ``spec``."""
        version = self.current_fsdp_version
        for strategy in FSDP_SHARDING_STRATEGY:
            if version == 2 and strategy != "FULL_SHARD":
                continue
            if strategy.lower() not in spec:
                continue
            if version == 1:
                cmd_config.append(f"--fsdp_sharding_strategy={strategy}")
            else:
                # FSDP2 uses `reshard_after_forward` instead of `sharding_strategy` and is true unless we test `NO_SHARD` (we don't)
                cmd_config.append("--fsdp_reshard_after_forward=true")
            break

    def _append_wrap_policy_args(self, cmd_config, spec):
        """Append the auto-wrap policy flag named in ``spec`` together with that policy's companion flag."""
        for policy in FSDP_AUTO_WRAP_POLICY:
            if policy.lower() in spec:
                cmd_config.append(f"--fsdp_auto_wrap_policy={policy}")
                break

        # `policy` is deliberately read after the loop: it is the matched policy, or the
        # last entry of FSDP_AUTO_WRAP_POLICY when nothing in `spec` matched.
        if policy == "TRANSFORMER_BASED_WRAP":
            cmd_config.append("--fsdp_transformer_layer_cls_to_wrap=BertLayer")
        elif policy == "SIZE_BASED_WRAP":
            cmd_config.append("--fsdp_min_num_params=2000")

    @require_fp16
    def test_performance(self):
        """Run ``test_performance.py`` once per performance config of the current FSDP version."""
        self.test_file_path = self.test_scripts_folder / "test_performance.py"
        fsdp_version = self.current_fsdp_version
        base_cmd = get_launch_command(
            num_processes=2, num_machines=1, machine_rank=0, use_fsdp=True, fsdp_version=self.current_fsdp_version
        )
        script_args = [
            self.test_file_path,
            f"--output_dir={self.tmpdir}",
            f"--performance_lower_bound={self.performance_lower_bound}",
        ]

        for config in self.performance_configs[fsdp_version]:
            cmd_config = base_cmd.copy()
            cmd_config.append(f"--fsdp_version={fsdp_version}")
            self._append_sharding_args(cmd_config, config)

            mixed_precision = "no" if "fp32" in config else "fp16"
            cmd_config.append(f"--mixed_precision={mixed_precision}")

            if "cpu_offload" in config:
                cmd_config.append("--fsdp_offload_params=True")

            self._append_wrap_policy_args(cmd_config, config)
            cmd_config.extend(script_args)

            with patch_environment(omp_num_threads=1):
                execute_subprocess_async(cmd_config)

    @require_fp16
    def test_checkpointing(self):
        """Run ``test_checkpointing.py`` for one partial epoch, then resume from the ``epoch_0`` checkpoint.

        This is done for every state dict type of the current FSDP version except
        ``LOCAL_STATE_DICT``, and only for the ``FULL_SHARD`` strategy.
        """
        self.test_file_path = self.test_scripts_folder / "test_checkpointing.py"
        fsdp_version = self.current_fsdp_version
        base_cmd = get_launch_command(
            num_processes=2,
            num_machines=1,
            machine_rank=0,
            use_fsdp=True,
            mixed_precision="fp16",
            fsdp_transformer_layer_cls_to_wrap="BertLayer",
            fsdp_version=fsdp_version,
        )
        state_dict_types = FSDP_STATE_DICT_TYPE if fsdp_version == 1 else FSDP2_STATE_DICT_TYPE

        for strategy in FSDP_SHARDING_STRATEGY:
            if strategy != "FULL_SHARD":
                continue

            if fsdp_version == 1:
                sharding_flag = f"--fsdp_sharding_strategy={strategy}"
            else:
                sharding_flag = "--fsdp_reshard_after_forward=true"
            launch_prefix = [*base_cmd, sharding_flag]

            for state_dict_type in state_dict_types:
                # Todo: Currently failing for `LOCAL_STATE_DICT` with error
                # Unexpected key(s) in state_dict: "_fsdp_wrapped_module._flat_param".
                if state_dict_type == "LOCAL_STATE_DICT":
                    continue

                shared_args = [
                    *launch_prefix,
                    f"--fsdp_state_dict_type={state_dict_type}",
                    self.test_file_path,
                    f"--output_dir={self.tmpdir}",
                ]

                # First run: train for one partial epoch so a checkpoint gets written.
                with patch_environment(omp_num_threads=1):
                    execute_subprocess_async([*shared_args, "--partial_train_epoch=1"])

                # Second run: same arguments, but resuming instead of training partially.
                resume_from_checkpoint = os.path.join(self.tmpdir, "epoch_0")
                with patch_environment(omp_num_threads=1):
                    execute_subprocess_async([*shared_args, f"--resume_from_checkpoint={resume_from_checkpoint}"])

    @require_fp16
    def test_peak_memory_usage(self):
        """Run ``test_peak_memory_usage.py`` for every FSDP spec with its peak memory upper bound."""
        self.test_file_path = self.test_scripts_folder / "test_peak_memory_usage.py"
        fsdp_version = self.current_fsdp_version
        base_cmd = get_launch_command(num_processes=2, num_machines=1, machine_rank=0, fsdp_version=fsdp_version)

        for spec, peak_mem_upper_bound in self.peak_memory_usage_upper_bound[fsdp_version].items():
            # The plain multi-GPU baseline entry is not executed here.
            if "multi_gpu" in spec:
                continue

            cmd_config = base_cmd.copy()
            mixed_precision = "fp16" if "fp16" in spec else "no"
            cmd_config.append(f"--mixed_precision={mixed_precision}")
            cmd_config.append("--use_fsdp")
            self._append_sharding_args(cmd_config, spec)

            if "cpu_offload" in spec:
                cmd_config.append("--fsdp_offload_params=True")

            self._append_wrap_policy_args(cmd_config, spec)

            cmd_config.extend(
                [
                    self.test_file_path,
                    f"--output_dir={self.tmpdir}",
                    f"--peak_memory_upper_bound={peak_mem_upper_bound}",
                    f"--n_train={self.n_train}",
                    f"--n_val={self.n_val}",
                ]
            )
            with patch_environment(omp_num_threads=1):
                execute_subprocess_async(cmd_config)


@require_fsdp2
@run_first
# Skip this test when TorchXLA is available because accelerate.launch does not support TorchXLA FSDP.
@require_non_torch_xla
@require_multi_device
@slow
class FSDP2IntegrationTest(FSDPIntegrationTest):
    """Re-runs every inherited ``FSDPIntegrationTest`` test with ``current_fsdp_version`` set to 2."""

    def setUp(self):
        # The parent setUp fills in the config tables and sets the version to 1; override it afterwards.
        super().setUp()
        self.current_fsdp_version = 2


================================================
FILE: tests/test_accelerator.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import json
import os
import pickle
import tempfile
import time
from unittest import skip
from unittest.mock import patch

import psutil
import torch
from parameterized import parameterized
from torch.utils.data import DataLoader, TensorDataset

from accelerate import DistributedType, infer_auto_device_map, init_empty_weights, load_checkpoint_and_dispatch
from accelerate.accelerator import Accelerator
from accelerate.data_loader import DataLoaderDispatcher, DataLoaderShard, skip_first_batches
from accelerate.state import GradientState, PartialState
from accelerate.test_utils import (
    require_bnb,
    require_cuda_or_xpu,
    require_fp8,
    require_fp16,
    require_huggingface_suite,
    require_multi_device,
    require_non_cpu,
    require_non_hpu,
    require_transformer_engine,
    slow,
    torch_device,
)
from accelerate.test_utils.testing import (
    AccelerateTestCase,
    assert_exception,
    require_cuda,
    require_non_torch_xla,
    require_torchdata_stateful_dataloader,
)
from accelerate.utils import FP8RecipeKwargs, is_torchdata_stateful_dataloader_available, patch_environment
from accelerate.utils.dataclasses import DataLoaderConfiguration
from accelerate.utils.modeling import get_state_dict_from_offload, load_checkpoint_in_model
from accelerate.utils.random import set_seed


if is_torchdata_stateful_dataloader_available():
    from torchdata.stateful_dataloader import StatefulDataLoader


class ModelWithTiedWeights(torch.nn.Module):
    """Two linear layers whose ``weight`` and ``bias`` are the very same ``Parameter`` objects.

    NOTE(review): after tying, ``linear2`` carries ``linear1``'s (4, 2) weight, so ``forward``
    as written looks shape-incompatible -- confirm it is never actually called.
    """

    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(2, 4)
        self.linear2 = torch.nn.Linear(4, 2)
        # Tie the second layer to the first one by sharing the parameter objects.
        source = self.linear1
        self.linear2.weight, self.linear2.bias = source.weight, source.bias

    def forward(self, x):
        hidden = self.linear1(x)
        return self.linear2(hidden)


def create_components(tied_weights=False):
    """Build a tiny model, optimizer, scheduler and two dataloaders for the tests.

    Args:
        tied_weights: when true the model is a ``ModelWithTiedWeights``, otherwise a ``Linear(2, 4)``.

    Returns:
        The tuple ``(model, optimizer, scheduler, train_dl, valid_dl)``.
    """
    if tied_weights:
        model = ModelWithTiedWeights()
    else:
        model = torch.nn.Linear(2, 4)

    optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, steps_per_epoch=2, epochs=1)

    train_dl, valid_dl = (
        DataLoader(TensorDataset(torch.tensor(values))) for values in ([1, 2, 3], [4, 5, 6])
    )
    return model, optimizer, scheduler, train_dl, valid_dl


class ModelForTest(torch.nn.Module):
    """Small ``Linear(3, 4) -> BatchNorm1d(4) -> Linear(4, 5)`` network used by the save/load tests."""

    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(3, 4)
        self.batchnorm = torch.nn.BatchNorm1d(4)
        self.linear2 = torch.nn.Linear(4, 5)

    def forward(self, x):
        hidden = self.linear1(x)
        normalized = self.batchnorm(hidden)
        return self.linear2(normalized)


def create_dataloaders_for_test(batch_size=3, n_train_batches: int = 12, n_valid_batches: int = 2, num_workers=0):
    """Generate a ``(train_dataloader, valid_dataloader)`` tuple of dummy DataLoaders to test with.

    Each dataset holds ``batch_size * n_batches`` random samples with 3 input features and 5 targets.
    """

    def build_dataset(n_batches):
        n_samples = batch_size * n_batches
        features = torch.randn(n_samples, 3)
        targets = torch.randn(n_samples, 5)
        return TensorDataset(features, targets)

    # Both datasets are drawn before any DataLoader is created, train first.
    datasets = [build_dataset(n_batches) for n_batches in (n_train_batches, n_valid_batches)]
    train_dataloader, valid_dataloader = (
        DataLoader(dataset, batch_size=batch_size, num_workers=num_workers) for dataset in datasets
    )
    return (train_dataloader, valid_dataloader)


def get_signature(model):
    """Return the sum of the absolute values of every parameter of ``model`` as a Python number."""
    total = 0
    for param in model.parameters():
        total += param.abs().sum().item()
    return total


def load_random_weights(model):
    """Overwrite the parameters of ``model`` in place with those of a freshly initialised twin.

    Args:
        model: a ``torch.nn.Linear`` (with or without bias) or a ``ModelWithTiedWeights``.

    Raises:
        TypeError: if ``model`` is of any other type, instead of failing later on an
            unbound local variable.
    """
    if isinstance(model, torch.nn.Linear):
        # Mirror the presence of the bias so the state dict keys match for bias-less layers too.
        fresh = torch.nn.Linear(model.in_features, model.out_features, bias=model.bias is not None)
    elif isinstance(model, ModelWithTiedWeights):
        fresh = ModelWithTiedWeights()
    else:
        raise TypeError(f"load_random_weights does not support models of type {type(model).__name__}")
    model.load_state_dict(fresh.state_dict())


def parameterized_custom_name_func(func, param_num, param):
    """Build a sub-test name that reflects every parameter, not only the first one.

    The positional arguments in ``param.args`` are read as, in order: use safetensors,
    tied weights, number of workers, dispatch batches. Only the ones present contribute
    to the name. ``param_num`` is accepted for the ``name_func`` signature but not used.
    """
    args = param.args
    pieces = ["use_safetensors" if args[0] is True else "use_pytorch"]
    if len(args) > 1 and args[1] is True:
        pieces.append("_tied_weights")
    if len(args) > 2:
        pieces.append(f"_num_workers_{args[2]}")
    if len(args) > 3:
        pieces.append("_dispatch_batches" if args[3] is True else "_no_dispatch_batches")
    suffix = "".join(pieces)
    return f"{func.__name__}_{suffix}"


class AcceleratorTester(AccelerateTestCase):
    def test_partial_state_after_reset(self):
        """Verify the custom ``__getattr__`` errors of ``PartialState``.

        The reset hint must only appear when an expected attribute is read after
        ``_reset_state()``; unknown attributes get the plain message.
        """
        partial_state = PartialState()
        assert partial_state.num_processes > 0

        # Unknown attribute: regular AttributeError without the reset hint.
        with self.assertRaises(AttributeError) as unknown_attribute:
            partial_state.someotherthing
        message = str(unknown_attribute.exception)
        assert "'PartialState' object has no attribute" in message
        assert "This happens if `PartialState._reset_state()`" not in message

        # Expected attribute read after a reset: the error carries the reset hint.
        with self.assertRaises(AttributeError) as after_reset:
            partial_state._reset_state()
            partial_state.num_processes
        message = str(after_reset.exception)
        assert "`PartialState` object has no attribute" in message
        assert "This happens if `PartialState._reset_state()`" in message

        # Arbitrary attributes can still be set and read back.
        partial_state.someotherthing = "MyValue"
        assert partial_state.someotherthing == "MyValue"

    def test_accelerator_state_after_reset(self):
        """Verify the custom ``__getattr__`` errors of ``AcceleratorState``.

        The reset hint must only appear when an expected attribute is read after
        ``_reset_state()``; unknown attributes get the plain message.
        """
        accelerator = Accelerator()
        assert accelerator.num_processes > 0

        # Unknown attribute: regular AttributeError without the reset hint.
        with self.assertRaises(AttributeError) as unknown_attribute:
            accelerator.state.someotherthing
        message = str(unknown_attribute.exception)
        assert "'AcceleratorState' object has no attribute" in message
        assert "This happens if `AcceleratorState._reset_state()`" not in message

        # Expected attribute read after a reset: the error carries the reset hint.
        with self.assertRaises(AttributeError) as after_reset:
            accelerator.state._reset_state()
            accelerator.num_processes
        message = str(after_reset.exception)
        assert "`AcceleratorState` object has no attribute" in message
        assert "This happens if `AcceleratorState._reset_state()`" in message

        # Arbitrary attributes can still be set and read back.
        accelerator.state.someotherthing = "MyValue"
        assert accelerator.state.someotherthing == "MyValue"

    @require_non_cpu
    def test_accelerator_can_be_reinstantiated(self):
        """Check that asking for ``cpu=True`` after a non-CPU ``Accelerator`` was created raises ``ValueError``."""
        first_accelerator = Accelerator()
        shared_state = PartialState._shared_state
        assert shared_state["_cpu"] is False
        assert shared_state["device"].type in ["cuda", "mps", "npu", "xpu", "xla", "hpu"]

        with self.assertRaises(ValueError):
            second_accelerator = Accelerator(cpu=True)

    @require_cuda
    def test_setting_cpu_affinity(self):
        """Check that creating an ``Accelerator`` with CPU affinity enabled logs the core assignment."""
        with (
            patch_environment(accelerate_cpu_affinity=1, accelerate_debug_mode=1),
            self.assertLogs("accelerate.utils.environment", level="INFO") as captured,
        ):
            _ = Accelerator()
            for fragment in ("Assigning", "cpu cores to process"):
                assert any(fragment in line for line in captured.output)

    def test_mutable_states(self):
        """Check that values set on the ``Accelerator`` are visible through ``GradientState``."""
        accelerator = Accelerator()
        gradient_state = GradientState()

        # Gradient accumulation steps are mirrored as `num_steps`.
        assert gradient_state.num_steps == 1
        accelerator.gradient_accumulation_steps = 4
        assert gradient_state.num_steps == 4

        # `sync_gradients` is mirrored under the same name.
        assert gradient_state.sync_gradients is True
        accelerator.sync_gradients = False
        assert gradient_state.sync_gradients is False

        GradientState._reset_state()

    def test_prepared_objects_are_referenced(self):
        """Check that every object returned by ``prepare`` is tracked in the accelerator's internal lists."""
        accelerator = Accelerator()
        components = create_components()

        prepared = accelerator.prepare(*components)
        prepared_model, prepared_optimizer, prepared_scheduler, prepared_train_dl, prepared_valid_dl = prepared

        assert prepared_model in accelerator._models
        assert prepared_optimizer in accelerator._optimizers
        assert prepared_scheduler in accelerator._schedulers
        for prepared_dataloader in (prepared_train_dl, prepared_valid_dl):
            assert prepared_dataloader in accelerator._dataloaders

    @require_non_hpu  # hpu does not support empty_cache
    def test_free_memory_dereferences_prepared_components(self):
        """Check that ``free_memory`` empties the accelerator's tracking lists and does not leave RAM held."""

        def available_ram_mb():
            return psutil.virtual_memory().available // 1024 // 1024

        accelerator = Accelerator()
        # Free up refs with empty_cache() and gc.collect()
        accelerator.free_memory()
        components = create_components()
        free_cpu_ram_before = available_ram_mb()
        components = accelerator.prepare(*components)

        # Short sleep here makes this test more reliable
        time.sleep(1e-3)

        components = accelerator.free_memory(*components)

        free_cpu_ram_after = available_ram_mb()

        for tracked in (
            accelerator._models,
            accelerator._optimizers,
            accelerator._schedulers,
            accelerator._dataloaders,
        ):
            assert len(tracked) == 0

        # The less-than comes *specifically* from device CPU things/won't be present on CPU builds
        # Allow a small tolerance for OS-level memory fluctuations between measurements
        assert free_cpu_ram_after <= free_cpu_ram_before + 50

    @require_non_torch_xla
    def test_env_var_device(self):
        """Tests that setting the torch device with ACCELERATE_TORCH_DEVICE overrides default device."""
        PartialState._reset_state()

        requested_device = f"{torch_device}:64"

        # Mock torch's set_device call to avoid an exception as the device doesn't exist
        def do_nothing(*args, **kwargs):
            return None

        with (
            patch(f"torch.{torch_device}.set_device", do_nothing),
            patch_environment(ACCELERATE_TORCH_DEVICE=requested_device),
        ):
            accelerator = Accelerator()
            assert str(accelerator.state.device) == requested_device

    @parameterized.expand([(True, True), (True, False), (False, False)], name_func=parameterized_custom_name_func)
    def test_save_load_model(self, use_safetensors, tied_weights):
        """Check that ``save_state`` followed by ``load_state`` restores the model weights."""
        accelerator = Accelerator()
        components = create_components(tied_weights)
        model = components[0]
        accelerator.prepare(*components)

        reference_signature = get_signature(model)

        with tempfile.TemporaryDirectory() as checkpoint_dir:
            accelerator.save_state(checkpoint_dir, safe_serialization=use_safetensors)

            # make sure random weights don't match
            load_random_weights(model)
            drift = abs(reference_signature - get_signature(model))
            assert drift > 1e-3

            # make sure loaded weights match
            accelerator.load_state(checkpoint_dir)
            drift = abs(reference_signature - get_signature(model))
            assert drift < 1e-3

    @parameterized.expand([True, False], name_func=parameterized_custom_name_func)
    def test_save_model(self, use_safetensors):
        """Check that weights written by ``save_model`` load back unchanged."""
        accelerator = Accelerator()
        model = torch.nn.Linear(10, 10)
        reference_signature = get_signature(model)

        with tempfile.TemporaryDirectory() as save_dir:
            accelerator.save_model(model, save_dir, safe_serialization=use_safetensors)
            # make sure loaded weights match
            load_checkpoint_in_model(model, save_dir)
            drift = abs(reference_signature - get_signature(model))
            assert drift < 1e-3

    @parameterized.expand([True, False], name_func=parameterized_custom_name_func)
    def test_save_sharded_model(self, use_safetensors):
        """Check that a model saved with a tiny ``max_shard_size`` reloads to produce the same outputs."""
        accelerator = Accelerator()
        inputs = torch.randn(3, 3)
        model = ModelForTest()
        expected = model(inputs)

        with tempfile.TemporaryDirectory() as save_dir:
            # By setting it to 100, we will split the model into 3 shards
            accelerator.save_model(
                model,
                save_dir,
                safe_serialization=use_safetensors,
                max_shard_size=100,
            )
            # make sure loaded weights match
            load_checkpoint_in_model(model, save_dir)
            output = model(inputs)

        outputs_match = torch.allclose(expected, output, atol=1e-5)
        assert outputs_match

    @parameterized.expand([True, False], name_func=parameterized_custom_name_func)
    def test_save_model_offload(self, use_safetensors):
        """Check that a model dispatched with a cpu/disk device map can be saved again and reloaded."""
        accelerator = Accelerator()
        device_map = {"linear1": "cpu", "batchnorm": "disk", "linear2": "cpu"}

        inputs = torch.randn(3, 3)
        model = ModelForTest()
        expected = model(inputs)

        with tempfile.TemporaryDirectory() as work_dir:
            save_kwargs = {"safe_serialization": use_safetensors}
            accelerator.save_model(model, work_dir, **save_kwargs)
            # load and save offloaded model
            load_checkpoint_and_dispatch(model, work_dir, device_map=device_map, offload_folder=work_dir)
            accelerator.save_model(model, work_dir, **save_kwargs)

            # load weights that were saved from the offloaded model
            load_checkpoint_and_dispatch(model, work_dir)
            output = model(inputs)

        outputs_match = torch.allclose(expected, output, atol=1e-5)
        assert outputs_match

    @parameterized.expand([True, False], name_func=parameterized_custom_name_func)
    @require_non_cpu
    def test_get_state_dict_from_offload(self, use_safetensors):
        """Check that ``get_state_dict_from_offload`` onloads a disk-mapped weight to the requested device."""
        accelerator = Accelerator()
        device_map = {"linear1": "cpu", "batchnorm": "disk", "linear2": "disk"}
        model = ModelForTest()
        offloaded_layer_weight = model.linear2.weight

        with tempfile.TemporaryDirectory() as work_dir:
            accelerator.save_model(model, work_dir, safe_serialization=use_safetensors)
            # load model with offloaded layers
            load_checkpoint_and_dispatch(model, work_dir, device_map=device_map, offload_folder=work_dir)

            # Onload the same weight once to the CPU and once to device index 0.
            onloaded_weights = {
                target: get_state_dict_from_offload(
                    model.linear2, "linear2.weight", {"linear2.weight": ""}, device_to_put_offload=target
                )["linear2.weight"]
                for target in ("cpu", 0)
            }
            cpu_onloaded_layer_weight = onloaded_weights["cpu"]
            device_onloaded_layer_weight = onloaded_weights[0]

        assert torch.allclose(offloaded_layer_weight, cpu_onloaded_layer_weight)
        # must be on the same device for torch.allclose()
        assert torch.allclose(offloaded_layer_weight, device_onloaded_layer_weight.to("cpu"))
        assert cpu_onloaded_layer_weight.device.type == "cpu"
        assert device_onloaded_layer_weight.device.type == torch_device

    @parameterized.expand([True, False], name_func=parameterized_custom_name_func)
    def test_save_load_model_with_hooks(self, use_safetensors):
        """
        Check that save/load state pre-hooks run while registered and no longer run once removed.

        The save hook writes the model class name to `data.json`; the load hook reads it back into
        `model.class_name`. The test runs one save/load round trip with the hooks and one without.
        """
        accelerator = Accelerator()
        model, optimizer, scheduler, train_dl, valid_dl = create_components()
        accelerator.prepare(model, optimizer, scheduler, train_dl, valid_dl)

        # Reference signature of the weights, compared against after each reload
        model_signature = get_signature(model)

        # saving hook
        def save_config(models, weights, output_dir):
            config = {"class_name": models[0].__class__.__name__}

            with open(os.path.join(output_dir, "data.json"), "w") as f:
                json.dump(config, f)

        # loading hook
        def load_config(models, input_dir):
            with open(os.path.join(input_dir, "data.json")) as f:
                config = json.load(f)

            models[0].class_name = config["class_name"]

        # Keep the returned handles so the hooks can be removed later
        save_hook = accelerator.register_save_state_pre_hook(save_config)
        load_hook = accelerator.register_load_state_pre_hook(load_config)

        # Round trip 1: hooks registered
        with tempfile.TemporaryDirectory() as tmpdirname:
            accelerator.save_state(tmpdirname, safe_serialization=use_safetensors)

            # make sure random weights don't match with hooks
            load_random_weights(model)
            assert abs(model_signature - get_signature(model)) > 1e-3

            # random class name to verify correct one is loaded
            model.class_name = "random"

            # make sure loaded weights match with hooks
            accelerator.load_state(tmpdirname)
            assert abs(model_signature - get_signature(model)) < 1e-3

            # model.class_name is loaded from config
            assert model.class_name == model.__class__.__name__

        # remove hooks
        save_hook.remove()
        load_hook.remove()

        # Round trip 2: hooks removed
        with tempfile.TemporaryDirectory() as tmpdirname:
            accelerator.save_state(tmpdirname, safe_serialization=use_safetensors)

            # make sure random weights don't match with hooks removed
            load_random_weights(model)
            assert abs(model_signature - get_signature(model)) > 1e-3

            # random class name to verify correct one is loaded
            model.class_name = "random"

            # make sure loaded weights match with hooks removed
            accelerator.load_state(tmpdirname)
            assert abs(model_signature - get_signature(model)) < 1e-3

            # model.class_name is NOT loaded from config
            assert model.class_name != model.__class__.__name__

    def test_accelerator_none(self):
        """Make sure `accelerator.prepare()` accepts a `None` argument and returns it as `None`."""
        accelerator = Accelerator()
        net, optim, sched, train_loader, eval_loader = create_components()
        placeholder = None

        # Preparing must not raise even though the last argument is `None`.
        net, optim, sched, train_loader, eval_loader, placeholder = accelerator.prepare(
            net, optim, sched, train_loader, eval_loader, placeholder
        )
        assert placeholder is None

    def test_is_accelerator_prepared(self):
        """Checks that `_is_accelerate_prepared` is set on prepared objects and left unset on other objects."""
        accelerator = Accelerator()
        model, optimizer, scheduler, train_dl, valid_dl = create_components()
        dummy_obj = [1, 2, 3]

        # This should work
        model, optimizer, scheduler, train_dl, valid_dl, dummy_obj = accelerator.prepare(
            model, optimizer, scheduler, train_dl, valid_dl, dummy_obj
        )

        # A plain list is not something `prepare` wraps, so the flag must stay absent/False on it.
        assert getattr(dummy_obj, "_is_accelerate_prepared", False) is False, (
            "Dummy object should not have `_is_accelerate_prepared` set to `True`"
        )

        # Every real training component must carry the flag after `prepare`.
        prepared_components = {
            "Model": model,
            "Optimizer": optimizer,
            "Scheduler": scheduler,
            "Train Dataloader": train_dl,
            "Valid Dataloader": valid_dl,
        }
        for label, component in prepared_components.items():
            assert getattr(component, "_is_accelerate_prepared", False) is True, (
                f"{label} is missing `_is_accelerate_prepared` or is set to `False`"
            )

    @require_cuda_or_xpu
    @slow
    @require_bnb
    def test_accelerator_bnb(self):
        """Prepare an 8-bit bitsandbytes model whose device map places everything on device 0."""
        from transformers import AutoModelForCausalLM, BitsAndBytesConfig

        eight_bit_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            "EleutherAI/gpt-neo-125m",
            quantization_config=eight_bit_config,
            device_map={"": 0},
        )
        accelerator = Accelerator()

        # Preparing is expected to succeed without raising.
        model = accelerator.prepare(model)

    @require_cuda_or_xpu
    @slow
    @require_bnb
    @skip("Passing locally but not on CI. Also no one will try to train an offloaded bnb model")
    def test_accelerator_bnb_cpu_error(self):
        """Preparing an 8-bit bitsandbytes model whose `lm_head` is mapped to the CPU must raise a `ValueError`."""
        from transformers import AutoModelForCausalLM, BitsAndBytesConfig

        checkpoint = "EleutherAI/gpt-neo-125m"
        accelerator = Accelerator()

        # Infer a device map under `init_empty_weights`, then force the head onto the CPU.
        with init_empty_weights():
            model = AutoModelForCausalLM.from_pretrained(checkpoint)
            model.tie_weights()
            device_map = infer_auto_device_map(model)
            device_map["lm_head"] = "cpu"

        offload_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            device_map=device_map,
            quantization_config=offload_config,
        )

        # Preparing a model spread across CPU and accelerator is expected to be rejected.
        with self.assertRaises(ValueError):
            model = accelerator.prepare(model)

    @require_non_torch_xla
    @require_non_hpu  # bnb is not supported on HPU
    @slow
    @require_bnb
    @require_multi_device
    def test_accelerator_bnb_multi_device(self):
        """Preparing an 8-bit model split over two devices must raise a `ValueError` in a distributed setup."""
        from transformers import AutoModelForCausalLM, BitsAndBytesConfig

        # Map each supported backend to the distributed type the shared state is set to.
        distributed_type_by_device = {
            "cuda": DistributedType.MULTI_GPU,
            "npu": DistributedType.MULTI_NPU,
            "xpu": DistributedType.MULTI_XPU,
        }
        if torch_device not in distributed_type_by_device:
            raise ValueError(f"{torch_device} is not supported in test_accelerator_bnb_multi_device.")
        PartialState._shared_state = {"distributed_type": distributed_type_by_device[torch_device]}

        checkpoint = "EleutherAI/gpt-neo-125m"

        # Infer a device map under `init_empty_weights`, then push the head to the second device.
        with init_empty_weights():
            model = AutoModelForCausalLM.from_pretrained(checkpoint)
            model.tie_weights()
            device_map = infer_auto_device_map(model)
            device_map["lm_head"] = 1

        model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            device_map=device_map,
        )
        accelerator = Accelerator()

        # Preparing is expected to be rejected with a ValueError.
        with self.assertRaises(ValueError):
            _ = accelerator.prepare(model)

    @require_non_torch_xla
    @require_non_hpu  # bnb is not supported on HPU
    @slow
    @require_bnb
    @require_multi_device
    def test_accelerator_bnb_multi_device_no_distributed(self):
        """Prepare an 8-bit bitsandbytes model whose `lm_head` is mapped to device 1; no error is expected."""
        from transformers import AutoModelForCausalLM, BitsAndBytesConfig

        checkpoint = "EleutherAI/gpt-neo-125m"

        # Infer a device map under `init_empty_weights`, then push the head to the second device.
        with init_empty_weights():
            model = AutoModelForCausalLM.from_pretrained(checkpoint)
            device_map = infer_auto_device_map(model)
            device_map["lm_head"] = 1

        eight_bit_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            quantization_config=eight_bit_config,
            device_map=device_map,
        )
        accelerator = Accelerator()

        # Preparing is expected to succeed without raising.
        _ = accelerator.prepare(model)

    @require_non_cpu
    def test_accelerator_cpu_flag_prepare(self):
        """Prepare a lone optimizer with an `Accelerator(cpu=True)`; the call just has to go through."""
        linear = torch.nn.Linear(10, 10)
        optimizer = torch.optim.SGD(linear.parameters(), lr=0.01)
        cpu_accelerator = Accelerator(cpu=True)
        _ = cpu_accelerator.prepare(optimizer)

    @require_fp8
    @require_transformer_engine
    def test_can_unwrap_model_te(self):
        """Unwrap an fp8 (TE backend) prepared model and check it still runs, including after a pickle round trip."""
        components = create_components()
        model, optimizer = components[:2]
        te_recipe = FP8RecipeKwargs(backend="TE")
        accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[te_recipe])
        sample = torch.randn(10, 2).to(torch_device)

        model, optimizer = accelerator.prepare(model, optimizer)
        # sanity check: the prepared model runs
        model(sample)

        # the unwrapped model must still run
        model = accelerator.unwrap_model(model, keep_fp32_wrapper=False)
        model(sample)

        # and so must a pickled-then-restored copy
        restored = pickle.loads(pickle.dumps(model))
        restored(sample)

    @require_fp16
    @require_non_cpu
    def test_can_unwrap_model_fp16(self):
        """
        Regression test for #872: unwrapping an fp16 model with `keep_fp32_wrapper=False` used to fail with
        "Linear.forward() missing 1 required positional argument: 'input'".
        """
        model = create_components()[0]
        accelerator = Accelerator(mixed_precision="fp16")
        sample = torch.randn(10, 2).to(torch_device)

        model = accelerator.prepare(model)
        # sanity check: the prepared model runs
        model(sample)

        # the unwrapped model must still run
        model = accelerator.unwrap_model(model, keep_fp32_wrapper=False)
        model(sample)

        # and so must a pickled-then-restored copy
        restored = pickle.loads(pickle.dumps(model))
        restored(sample)

    def test_can_unwrap_model(self):
        """Unwrap a model prepared on CPU without mixed precision and check it runs and survives pickling."""
        model = create_components()[0]
        accelerator = Accelerator(mixed_precision="no", cpu=True)
        sample = torch.randn(10, 2)

        model = accelerator.prepare(model)
        # sanity check: the prepared model runs
        model(sample)

        # the unwrapped model must still run
        model = accelerator.unwrap_model(model, keep_fp32_wrapper=False)
        model(sample)

        # and so must a pickled-then-restored copy
        restored = pickle.loads(pickle.dumps(model))
        restored(sample)

    def test_can_unwrap_distributed_compiled_model_keep_torch_compile(self):
        """With `keep_torch_compile=True`, unwrapping a compiled `DataParallel` model yields a wrapper around the base model."""
        base_model = create_components()[0]
        accelerator = Accelerator()

        compiled_reference = torch.compile(base_model)
        compiled_parallel = torch.compile(torch.nn.DataParallel(base_model))

        unwrapped = accelerator.unwrap_model(compiled_parallel, keep_torch_compile=True)

        assert compiled_reference._orig_mod == unwrapped._orig_mod

    def test_can_unwrap_distributed_compiled_model_remove_torch_compile(self):
        """With `keep_torch_compile=False`, unwrapping a compiled `DataParallel` model yields the base model itself."""
        base_model = create_components()[0]
        accelerator = Accelerator()

        compiled_reference = torch.compile(base_model)
        compiled_parallel = torch.compile(torch.nn.DataParallel(base_model))

        unwrapped = accelerator.unwrap_model(compiled_parallel, keep_torch_compile=False)

        assert compiled_reference._orig_mod == unwrapped

    @parameterized.expand([True, False])
    def test_can_pickle_dataloader(self, dispatch_batches):
        """
        Pickle an `Accelerator` holding prepared dataloaders and check the restored dataloaders match the originals.
        """
        dataset = torch.utils.data.TensorDataset(torch.arange(10).to(torch_device))
        dl = torch.utils.data.DataLoader(dataset)
        skip_dl = skip_first_batches(dl, 2)

        # StatefulDataLoader is left out here because it does not seem to support pickling yet.
        # TODO: Add support for pickling StatefulDataLoader
        accelerator = Accelerator(
            dataloader_config=DataLoaderConfiguration(dispatch_batches=dispatch_batches, use_stateful_dataloader=False)
        )
        # The wrapper class expected for prepared dataloaders depends on `dispatch_batches`.
        expected_cls = DataLoaderDispatcher if dispatch_batches else DataLoaderShard

        original_dl, _ = accelerator.prepare(dl, skip_dl)
        assert isinstance(original_dl, expected_cls)

        restored_accelerator = pickle.loads(pickle.dumps(accelerator))
        assert len(restored_accelerator._dataloaders) == 2
        loaded_dl, loaded_skip_dl = restored_accelerator._dataloaders

        # The restored regular dataloader must be equivalent to the original one.
        assert isinstance(loaded_dl, DataLoader)
        assert isinstance(loaded_dl, expected_cls)
        assert len(loaded_dl) == len(original_dl)
        assert list(loaded_dl) == list(original_dl)

        # The restored skip dataloader must yield everything but the first two batches.
        assert isinstance(loaded_skip_dl, DataLoader)
        # NOTE(review): this checks `loaded_dl` again, not `loaded_skip_dl` — confirm whether that is intended.
        assert isinstance(loaded_dl, expected_cls)
        assert len(loaded_skip_dl) == len(original_dl) - 2
        assert list(loaded_skip_dl) == list(original_dl)[2:]

    # Ideally would be a parameterized test which works with either stateful or non-stateful dataloaders, but dependencies are a bit awkward.
    @require_torchdata_stateful_dataloader
    def test_prepared_objects_are_referenced_with_stateful_dataloader(self):
        """With `use_stateful_dataloader=True`, prepared dataloaders are `StatefulDataLoader` instances and every prepared object is tracked by the accelerator."""
        accelerator = Accelerator(dataloader_config=DataLoaderConfiguration(use_stateful_dataloader=True))
        model, optimizer, scheduler, train_dl, valid_dl = create_components()

        prepared = accelerator.prepare(model, optimizer, scheduler, train_dl, valid_dl)
        prepared_model, prepared_optimizer, prepared_scheduler, prepared_train_dl, prepared_valid_dl = prepared
        prepared_loaders = (prepared_train_dl, prepared_valid_dl)

        # Each prepared object is registered in the matching accelerator registry.
        assert prepared_model in accelerator._models
        assert prepared_optimizer in accelerator._optimizers
        assert prepared_scheduler in accelerator._schedulers
        for loader in prepared_loaders:
            assert loader in accelerator._dataloaders

        # Both dataloaders come back as stateful dataloaders.
        for loader in prepared_loaders:
            assert isinstance(loader, StatefulDataLoader)

    @parameterized.expand(
        itertools.product([True, False], [True, False], [0, 2], [True, False]),
        name_func=parameterized_custom_name_func,
    )
    @require_torchdata_stateful_dataloader
    def test_save_model_with_stateful_dataloader(self, use_safetensors, tied_weights, num_workers, dispatch_batches):
        """
        Test that saving and loading a model with a stateful dataloader returns the same model,
        and that the dataloader's iterator is restored properly.

        The state is saved part-way through one pass over the training dataloader; after the pass finishes,
        the state is reloaded and the rest of the dataloader is consumed again. The number of batches seen
        after reloading and the final weights are compared with those of the first pass.
        """
        set_seed(42)
        n_train_batches = 64  # Use enough batches to ensure we can get partial iterations on large compute
        dataloader_config = DataLoaderConfiguration(dispatch_batches=dispatch_batches, use_stateful_dataloader=True)
        accelerator = Accelerator(dataloader_config=dataloader_config)

        model, optimizer, scheduler, train_dl, valid_dl = create_components(tied_weights)
        # The dataloaders from `create_components` are replaced by ones with the requested size and workers
        train_dl, valid_dl = create_dataloaders_for_test(n_train_batches=n_train_batches, num_workers=num_workers)
        # NOTE(review): `model` is rebound here while `optimizer`/`scheduler` still come from
        # `create_components` — confirm they are meant to be used with this new model.
        model = ModelForTest()

        (
            prepared_model,
            prepared_optimizer,
            prepared_scheduler,
            prepared_train_dl,
            prepared_valid_dl,
        ) = accelerator.prepare(model, optimizer, scheduler, train_dl, valid_dl)

        assert isinstance(prepared_train_dl, StatefulDataLoader)
        assert isinstance(prepared_valid_dl, StatefulDataLoader)

        # Perform 3 training iterations to ensure the dataloader's iterator is advanced
        num_batches_to_skip = 3
        # NOTE(review): `train()` is called on `model`, not on `prepared_model` — confirm this is intended.
        model.train()
        # Batches seen after the save point in the first pass
        untrained_batches = []
        with tempfile.TemporaryDirectory() as tmpdirname:
            for step, batch in enumerate(prepared_train_dl):
                x, y = batch
                outputs = prepared_model(x)
                loss = torch.nn.functional.mse_loss(outputs, y)
                accelerator.backward(loss)
                prepared_optimizer.step()
                prepared_scheduler.step()
                prepared_optimizer.zero_grad()
                if step == num_batches_to_skip - 1:
                    # Save the state once we've gone through a few batches
                    accelerator.save_state(f"{tmpdirname}/state", safe_serialization=use_safetensors)
                if step >= num_batches_to_skip:
                    untrained_batches.append(batch)

            not_skipped_batches = accelerator.gather(untrained_batches)
            # We then unwrap the trained model
            unwrapped_model = accelerator.unwrap_model(prepared_model)

            # Clone the weights so the later reload/training cannot modify these reference copies
            original_linear1 = unwrapped_model.linear1.weight.clone()
            original_batchnorm = unwrapped_model.batchnorm.weight.clone()
            original_linear2 = unwrapped_model.linear2.weight.clone()

            # Resume the state
            accelerator.load_state(f"{tmpdirname}/state")

            # Train this to the end of the DataLoader
            batches_seen_with_loaded_dl = 0
            for batch in prepared_train_dl:
                x, y = batch
                outputs = prepared_model(x)
                loss = torch.nn.functional.mse_loss(outputs, y)
                accelerator.backward(loss)
                prepared_optimizer.step()
                prepared_scheduler.step()
                prepared_optimizer.zero_grad()
                batches_seen_with_loaded_dl += 1

            unwrapped_model_2 = accelerator.unwrap_model(prepared_model)

            new_linear1 = unwrapped_model_2.linear1.weight
            new_batchnorm = unwrapped_model_2.batchnorm.weight
            new_linear2 = unwrapped_model_2.linear2.weight

            # Assert equalities
            # The resumed dataloader must yield exactly as many batches as were left after the save point
            assert batches_seen_with_loaded_dl == len(not_skipped_batches)
            assert torch.allclose(original_linear1, new_linear1)
            assert torch.allclose(original_batchnorm, new_batchnorm)
            assert torch.allclose(original_linear2, new_linear2)

    @require_non_cpu
    @require_huggingface_suite
    def test_nested_hook(self):
        """
        Run a forward pass through a model dispatched with `device_map="auto"`, an offload folder and
        `preload_module_classes=["MyLinear"]`, where `MyLinear` modules are nested inside `MySubModel` blocks.
        """
        from transformers.modeling_utils import PretrainedConfig, PreTrainedModel

        class MyLinear(torch.nn.Module):
            # Leaf module holding both a submodule (`centroid`) and a direct parameter (`indices`)
            def __init__(self, device=None, dtype=None):
                factory_kwargs = {"device": device, "dtype": dtype}
                super().__init__()
                self.centroid = torch.nn.Embedding(1, 2)
                self.indices = torch.nn.Parameter(torch.empty((1, 2, 2), **factory_kwargs))

            def forward(self, x):
                # The output is reshaped back to the shape of the input
                orig_shape = x.shape
                x = torch.abs(x + self.indices).long()
                x = x % 2
                x = x.sum(-1)
                x = (self.centroid.weight + x).reshape(orig_shape)
                return x

        class MySubModel(torch.nn.Module):
            # Thin wrapper around one `MyLinear`; listed in `no_split_module_classes` below
            def __init__(self):
                super().__init__()
                self.layer = MyLinear()

            def forward(self, x):
                return self.layer(x)

        class MyModel(PreTrainedModel):
            # Stack of four `MySubModel` blocks applied one after the other
            def __init__(self, config):
                super().__init__(config)
                self.layer = torch.nn.ModuleList([MySubModel() for i in range(4)])

            def forward(self, x):
                for layer in self.layer:
                    x = layer(x)
                return x

        with tempfile.TemporaryDirectory() as tmpdirname:
            check_point = tmpdirname
            offload_folder = check_point + "/offload"
            os.makedirs(offload_folder, exist_ok=True)
            config = PretrainedConfig()
            # Save a real checkpoint first so it can be loaded into the empty model below
            m = MyModel(config)
            m.save_pretrained(check_point)

            with init_empty_weights():
                my_model = MyModel(config)
            # NOTE(review): the very small `max_memory` values presumably force offloading — confirm.
            my_model = load_checkpoint_and_dispatch(
                my_model,
                checkpoint=check_point,
                max_memory={"cpu": 60, 0: 60},
                device_map="auto",
                no_split_module_classes=["MySubModel"],
                offload_folder=offload_folder,
                preload_module_classes=["MyLinear"],
            )
            # before fix, this would raise an error
            #       weight is on the meta device, we need a `value` to put in on 0
            x = torch.randn(1, 2)
            my_model(x)

    @require_non_torch_xla
    def test_prepare_model_8bit_cpu_offload_raises_valueerror_not_typeerror(self):
        """A model flagged as 8-bit with a CPU `hf_device_map` must make `prepare_model` raise a `ValueError`."""

        class _SingleLinearModel(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.l = torch.nn.Linear(2, 2)

            def forward(self, x):
                return self.l(x)

        accelerator = Accelerator()
        model = _SingleLinearModel()

        # These two attributes steer `prepare_model` into the 8-bit/4-bit + hf_device_map code path.
        model.is_loaded_in_8bit = True
        model.hf_device_map = {"": "cpu"}

        # Both availability checks are patched to report False while `prepare_model` runs.
        with (
            patch("accelerate.accelerator.is_bitsandbytes_multi_backend_available", return_value=False),
            patch("accelerate.accelerator.is_xpu_available", return_value=False),
            assert_exception(ValueError, "CPU or disk offload"),
        ):
            accelerator.prepare_model(model)


================================================
FILE: tests/test_big_modeling.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import gc
import itertools
import logging
import os
import unittest
from collections import OrderedDict
from tempfile import TemporaryDirectory

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer

from accelerate.big_modeling import (
    cpu_offload,
    cpu_offload_with_hook,
    disk_offload,
    dispatch_model,
    init_empty_weights,
    init_on_device,
    load_checkpoint_and_dispatch,
)
from accelerate.hooks import remove_hook_from_submodules
from accelerate.test_utils import (
    require_bnb,
    require_cuda_or_xpu,
    require_multi_device,
    require_multi_gpu_or_xpu,
    require_non_cpu,
    require_non_hpu,
    require_non_torch_xla,
    slow,
    torch_device,
)
from accelerate.utils import is_hpu_available, offload_state_dict
from accelerate.utils.memory import clear_device_cache
from accelerate.utils.versions import is_torch_version


logger = logging.getLogger(__name__)

# Remember the bare device type before `torch_device` is rebound to an indexed device string.
torch_device_type = torch_device
torch_device = "cpu" if torch_device == "cpu" else f"{torch_device}:0"

# Comparison tolerances: 1e-4 when HPU is available, 1e-5 otherwise.
ATOL = RTOL = 1e-4 if is_hpu_available() else 1e-5


class ModelForTest(nn.Module):
    """Test fixture: Linear(3, 4) -> BatchNorm1d(4) -> Linear(4, 5)."""

    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(3, 4)
        self.batchnorm = nn.BatchNorm1d(4)
        self.linear2 = nn.Linear(4, 5)

    def forward(self, x):
        hidden = self.linear1(x)
        normalized = self.batchnorm(hidden)
        return self.linear2(normalized)


class LinearWithNonPersistentBuffers(nn.Module):
    """Linear layer whose all-ones weight and bias are buffers; the bias buffer is registered as non-persistent."""

    def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None:
        super().__init__()
        tensor_kwargs = {"device": device, "dtype": dtype}
        self.in_features = in_features
        self.out_features = out_features

        weight = torch.ones((out_features, in_features), **tensor_kwargs)
        self.register_buffer("weight", weight)

        if not bias:
            self.register_buffer("bias", None)
        else:
            self.register_buffer("bias", torch.ones(out_features, **tensor_kwargs), persistent=False)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        weight, bias = self.weight, self.bias
        return torch.nn.functional.linear(input, weight, bias)


class ModelForTestNonPersistentBuffers(nn.Module):
    """Test fixture: two `LinearWithNonPersistentBuffers` layers (3->4 and 4->5) around a BatchNorm1d(4)."""

    def __init__(self):
        super().__init__()
        self.linear1 = LinearWithNonPersistentBuffers(3, 4)
        self.batchnorm = nn.BatchNorm1d(4)
        self.linear2 = LinearWithNonPersistentBuffers(4, 5)

    def forward(self, x):
        hidden = self.linear1(x)
        normalized = self.batchnorm(hidden)
        return self.linear2(normalized)


class ModelForTestCopy(nn.Module):
    """Same layers as `ModelForTest`, but the forward pass also returns the `id` given at construction."""

    def __init__(self, id: int):
        super().__init__()
        self.id = id
        self.linear1 = nn.Linear(3, 4)
        self.batchnorm = nn.BatchNorm1d(4)
        self.linear2 = nn.Linear(4, 5)

    def forward(self, x):
        hidden = self.linear1(x)
        normalized = self.batchnorm(hidden)
        output = self.linear2(normalized)
        return output, self.id


class ModelForTestTiedWeights(nn.Module):
    """Test fixture: Linear(4, 4) -> BatchNorm1d(4) -> Linear(4, 4); both linear layers have the same shape."""

    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(4, 4)
        self.batchnorm = nn.BatchNorm1d(4)
        self.linear2 = nn.Linear(4, 4)

    def forward(self, x):
        hidden = self.linear1(x)
        normalized = self.batchnorm(hidden)
        return self.linear2(normalized)


class BiggerModelForTest(nn.Module):
    """Test fixture: Linear(3, 4) -> Linear(4, 5) -> BatchNorm1d(5) -> Linear(5, 6) -> Linear(6, 5)."""

    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(3, 4)
        self.linear2 = nn.Linear(4, 5)
        self.batchnorm = nn.BatchNorm1d(5)
        self.linear3 = nn.Linear(5, 6)
        self.linear4 = nn.Linear(6, 5)

    def forward(self, x):
        # Apply the stages one after the other, in the order they are chained.
        for stage in (self.linear1, self.linear2, self.batchnorm, self.linear3, self.linear4):
            x = stage(x)
        return x


# To test preload_module_classes
class ModuleWithUnusedSubModules(nn.Module):
    """Wraps an `nn.Linear` but reads its weight and bias directly in `forward` instead of calling the submodule."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        weight = self.linear.weight
        bias = self.linear.bias
        return x @ weight.t() + bias


class ModelWithUnusedSubModulesForTest(nn.Module):
    """Same layout as `BiggerModelForTest`, with every linear layer replaced by `ModuleWithUnusedSubModules`."""

    def __init__(self):
        super().__init__()
        self.linear1 = ModuleWithUnusedSubModules(3, 4)
        self.linear2 = ModuleWithUnusedSubModules(4, 5)
        self.batchnorm = nn.BatchNorm1d(5)
        self.linear3 = ModuleWithUnusedSubModules(5, 6)
        self.linear4 = ModuleWithUnusedSubModules(6, 5)

    def forward(self, x):
        # Apply the stages one after the other, in the order they are chained.
        for stage in (self.linear1, self.linear2, self.batchnorm, self.linear3, self.linear4):
            x = stage(x)
        return x


class BigModelingTester(unittest.TestCase):
    def test_init_empty_weights(self):
        """Check which tensors `init_empty_weights` places on the meta device."""
        meta_device = torch.device("meta")
        cpu_device = torch.device("cpu")

        # Basic usage: the parameters end up on the meta device.
        with init_empty_weights():
            linear = nn.Linear(4, 5)
        assert linear.weight.device == meta_device

        # Basic usage with a module that has buffers: the buffers are not touched.
        with init_empty_weights():
            batchnorm = nn.BatchNorm1d(4)
        assert batchnorm.weight.device == meta_device
        assert batchnorm.running_mean.device == cpu_device

        # With include_buffers=True the buffers end up on the meta device too.
        original_register_parameter = nn.Module.register_parameter
        original_register_buffer = nn.Module.register_buffer
        with init_empty_weights(include_buffers=True):
            batchnorm = nn.BatchNorm1d(4)
            # nn.Module.register_parameter/buffer shouldn't be changed with torch >= 2.0
            assert original_register_parameter == nn.Module.register_parameter
            assert original_register_buffer == nn.Module.register_buffer
        assert batchnorm.weight.device == meta_device
        assert batchnorm.running_mean.device == meta_device

        # Outside the context manager, PyTorch must behave as usual.
        batchnorm = nn.BatchNorm1d(4)
        assert batchnorm.weight.device == cpu_device
        assert batchnorm.running_mean.device == cpu_device

    def test_init_empty_weights_very_large_model(self):
        """Instantiate a 100 billion parameter model under `init_empty_weights`."""
        with init_empty_weights():
            # 1000 layers, each with a 10000 x 10000 weight matrix.
            layers = [nn.Linear(10000, 10000) for _ in range(1000)]
            _ = nn.Sequential(*layers)

    @require_non_cpu
    def test_init_on_device(self):
        """Parameters created under `init_on_device` must be placed on the requested device."""
        device = torch.device(torch_device)
        with init_on_device(device):
            model = nn.Linear(10, 10)
        # Check both parameters of the layer, not just the weight.
        assert model.weight.device == device
        assert model.bias.device == device

    def test_cpu_offload(self):
        """Model outputs must be unchanged by `cpu_offload`, with and without `offload_buffers`."""
        model = ModelForTest()
        inputs = torch.randn(2, 3)
        reference = model(inputs)

        execution_device = torch.device(torch_device)

        # First pass: offload parameters only.
        cpu_offload(model, execution_device=execution_device)
        offloaded_output = model(inputs)
        torch.testing.assert_close(reference, offloaded_output.cpu(), atol=ATOL, rtol=RTOL)

        # Drop the hooks before offloading the same model a second time.
        remove_hook_from_submodules(model)

        # Second pass: offload the buffers as well.
        cpu_offload(model, execution_device=execution_device, offload_buffers=True)
        offloaded_output = model(inputs)
        torch.testing.assert_close(reference, offloaded_output.cpu(), atol=ATOL, rtol=RTOL)

    def test_cpu_offload_with_unused_submodules(self):
        """Same as `test_cpu_offload`, for a model needing `preload_module_classes`."""
        model = ModelWithUnusedSubModulesForTest()
        inputs = torch.randn(2, 3)
        reference = model(inputs)

        execution_device = torch.device(torch_device)

        # First pass: offload parameters only.
        cpu_offload(
            model,
            execution_device=execution_device,
            preload_module_classes=["ModuleWithUnusedSubModules"],
        )
        offloaded_output = model(inputs)
        torch.testing.assert_close(reference, offloaded_output.cpu(), atol=ATOL, rtol=RTOL)

        # Drop the hooks before offloading the same model a second time.
        remove_hook_from_submodules(model)

        # Second pass: offload the buffers as well.
        cpu_offload(
            model,
            execution_device=execution_device,
            offload_buffers=True,
            preload_module_classes=["ModuleWithUnusedSubModules"],
        )
        offloaded_output = model(inputs)
        torch.testing.assert_close(reference, offloaded_output.cpu(), atol=ATOL, rtol=RTOL)

    @slow
    @require_non_cpu
    def test_cpu_offload_gpt2(self):
        """GPT-2 must generate the expected continuation once CPU-offloaded."""
        expected_text = "Hello world! My name is Kiyoshi, and I'm a student at"

        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        encoded = tokenizer("Hello world! My name is", return_tensors="pt").to(torch_device)

        gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
        cpu_offload(gpt2, execution_device=0)
        generated = gpt2.generate(encoded["input_ids"], max_new_tokens=10)
        decoded = tokenizer.decode(generated[0].tolist())
        assert decoded == expected_text

    def test_disk_offload(self):
        """Model outputs must be unchanged by `disk_offload`, with and without `offload_buffers`."""
        model = ModelForTest()
        inputs = torch.randn(2, 3)
        reference = model(inputs)

        execution_device = torch.device(torch_device)

        # First pass: offload parameters only.
        with TemporaryDirectory() as offload_dir:
            disk_offload(model, offload_dir, execution_device=execution_device)
            offloaded_output = model(inputs)
            torch.testing.assert_close(reference, offloaded_output.cpu(), atol=ATOL, rtol=RTOL)

            # Drop the hooks before offloading the same model a second time.
            remove_hook_from_submodules(model)

        # Second pass: offload the buffers as well, into a fresh directory.
        with TemporaryDirectory() as offload_dir:
            disk_offload(model, offload_dir, execution_device=execution_device, offload_buffers=True)
            offloaded_output = model(inputs)
            torch.testing.assert_close(reference, offloaded_output.cpu(), atol=ATOL, rtol=RTOL)

    def test_disk_offload_with_unused_submodules(self):
        """Same as `test_disk_offload`, for a model needing `preload_module_classes`."""
        model = ModelWithUnusedSubModulesForTest()
        inputs = torch.randn(2, 3)
        reference = model(inputs)

        execution_device = torch.device(torch_device)

        # First pass: offload parameters only.
        with TemporaryDirectory() as offload_dir:
            disk_offload(
                model,
                offload_dir,
                execution_device=execution_device,
                preload_module_classes=["ModuleWithUnusedSubModules"],
            )
            offloaded_output = model(inputs)
            torch.testing.assert_close(reference, offloaded_output.cpu(), atol=ATOL, rtol=RTOL)

            # Drop the hooks before offloading the same model a second time.
            remove_hook_from_submodules(model)

        # Second pass: offload the buffers as well, into a fresh directory.
        with TemporaryDirectory() as offload_dir:
            disk_offload(
                model,
                offload_dir,
                execution_device=execution_device,
                offload_buffers=True,
                preload_module_classes=["ModuleWithUnusedSubModules"],
            )
            offloaded_output = model(inputs)
            torch.testing.assert_close(reference, offloaded_output.cpu(), atol=ATOL, rtol=RTOL)

    @slow
    @require_non_cpu
    def test_disk_offload_gpt2(self):
        """GPT-2 must generate the expected continuation once disk-offloaded."""
        expected_text = "Hello world! My name is Kiyoshi, and I'm a student at"

        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        encoded = tokenizer("Hello world! My name is", return_tensors="pt").to(torch_device)

        gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
        with TemporaryDirectory() as offload_dir:
            disk_offload(gpt2, offload_dir, execution_device=0)
            generated = gpt2.generate(encoded["input_ids"], max_new_tokens=10)
            decoded = tokenizer.decode(generated[0].tolist())
            assert decoded == expected_text

    @require_non_cpu
    def test_dispatch_model_and_remove_hook(self):
        """Once hooks are removed, moving a dispatched model with `.to()` must not log a warning."""
        model = ModelForTest()
        device_map = {"linear1": "cpu", "batchnorm": "cpu", "linear2": 0}
        inputs = torch.randn(2, 3)
        reference = model(inputs)

        with TemporaryDirectory() as offload_dir:
            dispatch_model(model, device_map, offload_dir=offload_dir)
            dispatched_output = model(inputs)
            remove_hook_from_submodules(model)

            # `assertLogs` cannot assert that nothing was logged, so emit one dummy warning
            # and then verify it is the only record captured.
            with self.assertLogs(level="WARNING") as captured:
                model.to(torch_device)
                logger.warning("Dummy warning")
            self.assertEqual(len(captured.records), 1)
            self.assertIn("Dummy warning", captured.records[0].message)

            moved_output = model(inputs.to(torch_device))
            torch.testing.assert_close(reference, dispatched_output.cpu(), atol=ATOL, rtol=RTOL)
            torch.testing.assert_close(reference, moved_output.cpu(), atol=ATOL, rtol=RTOL)

    @require_non_cpu
    def test_dispatch_model(self):
        """Outputs must be unchanged when the model is split across disk, CPU and device 0."""
        model = ModelForTest()
        device_map = {"linear1": "disk", "batchnorm": "cpu", "linear2": 0}

        inputs = torch.randn(2, 3)
        reference = model(inputs)

        with TemporaryDirectory() as offload_dir:
            dispatch_model(model, device_map, offload_dir=offload_dir)
            dispatched_output = model(inputs)
            torch.testing.assert_close(reference, dispatched_output.cpu(), atol=ATOL, rtol=RTOL)

    @require_non_cpu
    def test_dispatch_model_with_non_persistent_buffers(self):
        """Dispatching with `offload_buffers=True` must keep outputs unchanged for `ModelForTestNonPersistentBuffers`."""
        model = ModelForTestNonPersistentBuffers()
        device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": "disk"}
        inputs = torch.randn(2, 3)
        reference = model(inputs)

        with TemporaryDirectory() as offload_dir:
            dispatch_model(model, device_map, offload_dir=offload_dir, offload_buffers=True)
            dispatched_output = model(inputs)
            torch.testing.assert_close(reference, dispatched_output.cpu(), atol=ATOL, rtol=RTOL)

    @require_non_cpu
    def test_dispatch_model_tied_weights(self):
        """Weights tied before `dispatch_model` must still be the same object afterwards."""
        model = ModelForTestTiedWeights()
        # Tie the two linear layers by sharing a single parameter object.
        model.linear1.weight = model.linear2.weight
        same_device_map = {"linear1": 0, "batchnorm": 0, "linear2": 0}

        dispatch_model(model, same_device_map)
        assert model.linear2.weight is model.linear1.weight

    @require_multi_gpu_or_xpu
    def test_dispatch_model_tied_weights_memory(self):
        """Check that tied weights are never duplicated on the device during `dispatch_model`.

        Device 0 is filled until only one weight matrix (plus 50 MB of headroom) fits, so any
        duplication of the tied weight while dispatching triggers an out-of-memory error.
        """
        torch_accelerator_module = getattr(torch, torch_device_type)

        clear_device_cache()  # Needed in case we run several tests in a row.

        model = nn.Sequential(
            OrderedDict(
                [
                    ("linear0", nn.Linear(5000, 5000, bias=False)),
                    ("linear1", nn.Linear(5000, 5000, bias=False)),
                    ("linear2", nn.Linear(5000, 5000, bias=False)),
                    ("linear3", nn.Linear(5000, 5000, bias=False)),
                    ("linear4", nn.Linear(5000, 5000, bias=False)),
                ]
            )
        )
        # linear0, linear2, linear3 and linear4 all share the same weight tensor.
        model.linear2.weight = model.linear0.weight
        model.linear3.weight = model.linear0.weight
        model.linear4.weight = model.linear0.weight

        x = torch.randn(5, 5000)
        with torch.no_grad():
            expected = model(x)

        # We should need only 5000 * 5000 * 32 // 8 * 1e-6 = 100 MB on the device 0 for the four linear weights.
        device_0 = f"{torch_device_type}:0" if torch_device != "cpu" else "cpu"
        device_1 = f"{torch_device_type}:1" if torch_device != "cpu" else "cpu"
        device_map = {
            "linear0": device_0,
            "linear1": device_1,
            "linear2": device_0,
            "linear3": device_0,
            "linear4": device_0,
        }

        # Just to initialize device context.
        a = torch.rand(5).to(device_0)  # noqa: F841

        free_memory_bytes = torch_accelerator_module.mem_get_info(device_0)[0]
        required_memory_bytes = 5000 * 5000 * (32 // 8)

        # Leaving 50 MB of free memory for possible buffers, etc.
        n_vals = (free_memory_bytes - required_memory_bytes - int(50e6)) // (32 // 8)
        # `foo` is never read, but it has to stay referenced so the memory remains occupied.
        foo = torch.rand(n_vals, device=device_0)  # noqa: F841

        # If this does OOM, there is an issue somewhere in dispatch_model: memory of tied weights is duplicated.
        oom_error = (
            torch.OutOfMemoryError if is_torch_version(">=", "2.5.0") else torch_accelerator_module.OutOfMemoryError
        )
        try:
            dispatch_model(model, device_map)
        except oom_error as e:
            # Re-raise with a pointer to this test, keeping the original error as the cause.
            # Any other exception propagates unchanged.
            raise oom_error(
                f"OOM error in dispatch_model. This is a bug and should not happen, see test_dispatch_model_tied_weights_memory. {e}"
            ) from e

        with torch.no_grad():
            output = model(x)
        torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

    @require_cuda_or_xpu
    def test_dispatch_model_tied_weights_memory_with_nested_offload_cpu(self):
        """Check that tied weights are not duplicated on the device with nested CPU offload.

        The accelerator is filled until only about 200 MB (plus 150 MB of headroom) is left, then
        the memory consumed by `dispatch_model` and by a forward pass is measured.
        """
        torch_accelerator_module = getattr(torch, torch_device_type)
        clear_device_cache()  # Needed in case we run several tests in a row.

        class SubModule(torch.nn.Module):
            # Holds a reference to a parameter owned by another module (tied weight).
            def __init__(self, ref_to_parameter):
                super().__init__()
                self.parameter = ref_to_parameter

            def forward(self, x):
                return x + torch.max(self.parameter)

        class LinearModuleAndSubModule(torch.nn.Linear):
            # Linear layer whose weight is also referenced by four nested submodules.
            def __init__(self, in_features, out_features):
                super().__init__(in_features, out_features, bias=False)
                self.weight_submodule = SubModule(self.weight)
                self.weight_submodule2 = SubModule(self.weight)
                self.weight_submodule3 = SubModule(self.weight)
                self.weight_submodule4 = SubModule(self.weight)

            def forward(self, x):
                a = torch.nn.functional.linear(self.weight_submodule(x), self.weight)
                b = torch.nn.functional.linear(self.weight_submodule2(x), self.weight)
                c = torch.nn.functional.linear(self.weight_submodule3(x), self.weight)
                d = torch.nn.functional.linear(self.weight_submodule4(x), self.weight)
                return a + b + c + d

        class ModelWithSubmodules(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.compute = LinearModuleAndSubModule(5000, 5000)
                self.compute1 = LinearModuleAndSubModule(5000, 5000)

            def forward(self, x):
                a = self.compute(x)
                b = self.compute1(x)
                return a + b

        # We should need only 2 * 5000 * 5000 * 32 // 8 * 1e-6 = 200 MB on the device 0 for the whole model forward, and not 600 MB.
        device_map = {"compute": torch_device, "compute1": "cpu"}

        model = ModelWithSubmodules()

        x = torch.randn(1, 5000)
        with torch.no_grad():
            expected = model(x)

        # Just to initialize accelerator context.
        a = torch.rand(5).to(torch_device)  # noqa: F841

        free_memory_bytes = torch_accelerator_module.mem_get_info(torch_device)[0]
        required_memory_bytes = 2 * 5000 * 5000 * (32 // 8)  # 200 MB

        # Leaving 150 MB of free memory for possible buffers, etc.
        n_vals = (free_memory_bytes - required_memory_bytes - int(150e6)) // (32 // 8)
        # `foo` is never read, but it has to stay referenced so the memory remains occupied.
        foo = torch.rand(n_vals, device=torch_device)  # noqa: F841

        free_memory_bytes_before_dispatch = torch_accelerator_module.mem_get_info(torch_device)[0]
        dispatch_model(model, device_map)
        free_memory_bytes_after_dispatch = torch_accelerator_module.mem_get_info(torch_device)[0]

        # Free memory shrinks when memory is consumed, so the amount used by dispatch is
        # "before - after". The reversed difference would be negative and could never fail.
        assert (free_memory_bytes_before_dispatch - free_memory_bytes_after_dispatch) * 1e-6 < 130

        original_pointer = model.compute1._hf_hook.weights_map["weight"].data_ptr()

        oom_error = (
            torch.OutOfMemoryError if is_torch_version(">=", "2.5.0") else torch_accelerator_module.OutOfMemoryError
        )
        with torch.no_grad():
            try:
                output = model(x)
            except oom_error as e:
                # Re-raise with a pointer to this test, keeping the original error as the cause.
                # Any other exception propagates unchanged.
                raise oom_error(
                    f"OOM error in dispatch_model. This is a bug and should not happen, see test_dispatch_model_tied_weights_memory_with_nested_offload_cpu. {e}"
                ) from e

        torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

        clear_device_cache()

        free_memory_bytes_after_infer = torch_accelerator_module.mem_get_info(torch_device)[0]

        # Check that we have no more references on GPU for the offloaded tied weight.
        assert len(model.compute1.weight_submodule._hf_hook.tied_params_map[original_pointer]) == 0
        assert len(model.compute1._hf_hook.tied_params_map[original_pointer]) == 0
        # Memory still held after inference, relative to the state right after dispatch.
        assert (free_memory_bytes_after_dispatch - free_memory_bytes_after_infer) * 1e-6 < 130

        # Test is flaky otherwise.
        del model
        gc.collect()

    # This test fails because sometimes data_ptr() of compute2.weight is the same as compute1.weight.
    # I checked that the values are not the same but it gives the same address. This does not happen on my local machine.
    @require_cuda_or_xpu
    @unittest.skip(
        "Flaky test, we should have enough coverage with test_dispatch_model_tied_weights_memory_with_nested_offload_cpu test"
    )
    def test_dispatch_model_tied_weights_memory_with_nested_offload_disk(self):
        """Check that tied weights are not duplicated on the device with nested disk offload.

        The accelerator is filled until only about 200 MB (plus 200 MB of headroom) is left, then
        the memory consumed by `dispatch_model` and by a forward pass is measured.
        """
        torch_accelerator_module = getattr(torch, torch_device_type)

        clear_device_cache()  # Needed in case we run several tests in a row.

        class SubModule(torch.nn.Module):
            # Holds a reference to a parameter owned by another module (tied weight).
            def __init__(self, ref_to_parameter):
                super().__init__()
                self.parameter = ref_to_parameter

            def forward(self, x):
                return x + torch.max(self.parameter)

        class LinearModuleAndSubModule(torch.nn.Linear):
            # Linear layer whose weight is also referenced by four nested submodules.
            def __init__(self, in_features, out_features):
                super().__init__(in_features, out_features, bias=False)
                self.weight_submodule = SubModule(self.weight)
                self.weight_submodule2 = SubModule(self.weight)
                self.weight_submodule3 = SubModule(self.weight)
                self.weight_submodule4 = SubModule(self.weight)

            def forward(self, x):
                a = torch.nn.functional.linear(self.weight_submodule(x), self.weight)
                b = torch.nn.functional.linear(self.weight_submodule2(x), self.weight)
                c = torch.nn.functional.linear(self.weight_submodule3(x), self.weight)
                d = torch.nn.functional.linear(self.weight_submodule4(x), self.weight)
                return a + b + c + d

        class ModelWithSubmodules(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.compute = LinearModuleAndSubModule(5000, 5000)
                self.compute1 = LinearModuleAndSubModule(5000, 5000)

            def forward(self, x):
                a = self.compute(x)
                b = self.compute1(x)
                return a + b

        # We should need only 2 * 5000 * 5000 * 32 // 8 * 1e-6 = 200 MB on the device 0 for the whole model forward, and not 600 MB.
        device_map = {"compute": 0, "compute1": "disk"}

        model = ModelWithSubmodules()

        x = torch.randn(1, 5000)
        with torch.no_grad():
            expected = model(x)

        # Just to initialize the accelerator context.
        device_0 = f"{torch_device_type}:0"
        a = torch.rand(5).to(device_0)  # noqa: F841

        free_memory_bytes = torch_accelerator_module.mem_get_info(device_0)[0]
        required_memory_bytes = 2 * 5000 * 5000 * (32 // 8)  # 200 MB

        # Leaving 200 MB of free memory for possible buffers, etc.
        n_vals = (free_memory_bytes - required_memory_bytes - int(200e6)) // (32 // 8)
        # `foo` is never read, but it has to stay referenced so the memory remains occupied.
        foo = torch.rand(n_vals, device=device_0)  # noqa: F841

        free_memory_bytes_before_dispatch = torch_accelerator_module.mem_get_info(device_0)[0]
        with TemporaryDirectory() as tmp_dir:
            dispatch_model(model, device_map, offload_dir=tmp_dir)
            free_memory_bytes_after_dispatch = torch_accelerator_module.mem_get_info(device_0)[0]

            # Free memory shrinks when memory is consumed, so the amount used by dispatch is
            # "before - after". The reversed difference would be negative and could never fail.
            assert (free_memory_bytes_before_dispatch - free_memory_bytes_after_dispatch) * 1e-6 < 130

            oom_error = (
                torch.OutOfMemoryError
                if hasattr(torch, "OutOfMemoryError")
                else torch_accelerator_module.OutOfMemoryError
            )
            with torch.no_grad():
                try:
                    output = model(x)
                except oom_error as e:
                    # Re-raise with a pointer to this test, keeping the original error as the cause.
                    # Any other exception propagates unchanged.
                    raise oom_error(
                        f"OOM error in dispatch_model. This is a bug and should not happen, see test_dispatch_model_tied_weights_memory_with_nested_offload_disk. {e}"
                    ) from e

            torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)

            clear_device_cache()

            free_memory_bytes_after_infer = torch_accelerator_module.mem_get_info(device_0)[0]

            # Check that we have no more references on GPU for the offloaded tied weight.
            submodule_tied_params = model.compute1.weight_submodule._hf_hook.tied_params_map
            n_non_empty = sum(1 for pointer_dict in submodule_tied_params.values() if len(pointer_dict) > 0)
            assert n_non_empty == 1  # `compute` layer one.

            module_tied_params = model.compute1._hf_hook.tied_params_map
            n_non_empty = sum(1 for pointer_dict in module_tied_params.values() if len(pointer_dict) > 0)
            assert n_non_empty == 1  # `compute` layer one.

            # Memory still held after inference, relative to the state right after dispatch.
            assert (free_memory_bytes_after_dispatch - free_memory_bytes_after_infer) * 1e-6 < 130

    @require_non_hpu  # hpu does not support device indexing "hpu:1"
    @require_multi_device
    def test_dispatch_model_multi_devices(self):
        """Outputs must be unchanged when the model is split across CPU, disk and two devices."""
        model = BiggerModelForTest()

        device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 1}

        inputs = torch.randn(2, 3)
        reference = model(inputs)

        with TemporaryDirectory() as offload_dir:
            dispatch_model(model, device_map, offload_dir=offload_dir)
            dispatched_output = model(inputs)
            torch.testing.assert_close(reference, dispatched_output.cpu(), atol=ATOL, rtol=RTOL)

    @require_non_cpu
    def test_dispatch_model_copy(self):
        """A deep copy of a dispatched model must run on its own state and give the same output."""
        original_model = ModelForTestCopy(id=1)
        device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": 0}

        inputs = torch.randn(2, 3)
        reference, original_output_id = original_model(inputs)

        dispatch_model(original_model, device_map)

        # Give the copy a different id so its forward can be told apart from the original's.
        copied_model = copy.deepcopy(original_model)
        copied_model.id = 2
        copied_output, copied_output_id = copied_model(inputs)

        assert original_model.id == original_output_id
        assert copied_model.id == copied_output_id
        assert copied_model.linear1.forward is not original_model.linear1.forward
        torch.testing.assert_close(reference, copied_output.cpu(), atol=ATOL, rtol=RTOL)

    @require_non_cpu
    def test_dispatch_model_move_offloaded_model(self):
        """Calling `.to()` on a model with disk-offloaded modules must raise a `RuntimeError`."""
        model = ModelForTest()
        device_map = {"linear1": "disk", "batchnorm": "cpu", "linear2": 0}
        with TemporaryDirectory() as offload_dir:
            dispatch_model(model, device_map, offload_dir=offload_dir)
            with self.assertRaises(RuntimeError):
                model.to(0)

    @require_non_hpu  # hpu does not support device indexing "hpu:1"
    @require_multi_device
    def test_dispatch_model_move_model_warning(self):
        """Moving a model dispatched on two devices must warn, and running it afterwards must fail.

        Both `.to()` calls are expected to log a warning on the `accelerate.big_modeling` logger,
        and the forward pass on the moved model is expected to raise a `RuntimeError`.
        """
        model = ModelForTest()
        device_map = {"linear1": 0, "batchnorm": 0, "linear2": 1}
        with TemporaryDirectory() as tmp_dir:
            dispatch_model(model, device_map, offload_dir=tmp_dir)
            with self.assertLogs("accelerate.big_modeling", level="WARNING"):
                model.to("cpu")
            with self.assertLogs("accelerate.big_modeling", level="WARNING"):
                model.to(torch_device)
            # Build the input outside `assertRaises` so only the forward pass can satisfy it.
            x = torch.randn(2, 3)
            with self.assertRaises(RuntimeError):
                model(x)

    @slow
    @require_non_hpu  # hpu does not support device indexing "hpu:1"
    @require_multi_device
    def test_dispatch_model_gpt2_on_two_devices(self):
        """GPT-2 must generate the same text when split across two devices, CPU and disk."""
        expected_text = "Hello world! My name is Kiyoshi, and I'm a student at"

        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        encoded = tokenizer("Hello world! My name is", return_tensors="pt").to(torch_device)

        # Step 1: everything on devices 0 and 1 (first six blocks on 0, last six on 1).
        gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
        device_map = {
            "transformer.wte": 0,
            "transformer.wpe": 0,
            "transformer.ln_f": 1,
            "lm_head": 0,
        }
        device_map.update({f"transformer.h.{i}": 0 if i <= 5 else 1 for i in range(12)})

        gpt2 = dispatch_model(gpt2, device_map)
        generated = gpt2.generate(encoded["input_ids"], max_new_tokens=10)
        assert tokenizer.decode(generated[0].tolist()) == expected_text

        # Step 2: move the first four blocks to the CPU.
        gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
        device_map.update({f"transformer.h.{i}": "cpu" for i in range(4)})
        gpt2 = dispatch_model(gpt2, device_map)
        generated = gpt2.generate(encoded["input_ids"], max_new_tokens=10)
        assert tokenizer.decode(generated[0].tolist()) == expected_text

        # Step 3: additionally move the first two blocks to disk.
        gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
        device_map.update({f"transformer.h.{i}": "disk" for i in range(2)})

        with TemporaryDirectory() as offload_dir:
            # Save the weights whose name contains either prefix into the offload folder.
            disk_prefixes = ("transformer.h.0", "transformer.h.1")
            state_dict = {
                key: tensor
                for key, tensor in gpt2.state_dict().items()
                if any(prefix in key for prefix in disk_prefixes)
            }
            offload_state_dict(offload_dir, state_dict)
            gpt2 = dispatch_model(gpt2, device_map, offload_dir=offload_dir)
            generated = gpt2.generate(encoded["input_ids"], max_new_tokens=10)
            assert tokenizer.decode(generated[0].tolist()) == expected_text

    @require_non_cpu
    def test_dispatch_model_with_unused_submodules(self):
        """Dispatching with `preload_module_classes` must keep outputs unchanged on one device."""
        model = ModelWithUnusedSubModulesForTest()
        device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 0}

        inputs = torch.randn(2, 3)
        reference = model(inputs)

        with TemporaryDirectory() as offload_dir:
            dispatch_model(
                model,
                device_map,
                offload_dir=offload_dir,
                preload_module_classes=["ModuleWithUnusedSubModules"],
            )
            dispatched_output = model(inputs)
            torch.testing.assert_close(reference, dispatched_output.cpu(), atol=ATOL, rtol=RTOL)

    @require_non_hpu  # hpu does not support device indexing "hpu:1"
    @require_multi_device
    def test_dispatch_model_with_unused_submodules_multi_device(self):
        """Dispatching with `preload_module_classes` must keep outputs unchanged on two devices."""
        model = ModelWithUnusedSubModulesForTest()

        device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 1}

        inputs = torch.randn(2, 3)
        reference = model(inputs)

        with TemporaryDirectory() as offload_dir:
            dispatch_model(
                model,
                device_map,
                offload_dir=offload_dir,
                preload_module_classes=["ModuleWithUnusedSubModules"],
            )
            dispatched_output = model(inputs)
            torch.testing.assert_close(reference, dispatched_output.cpu(), atol=ATOL, rtol=RTOL)

    @require_non_cpu
    def test_dispatch_model_force_hooks(self):
        """Outputs must be unchanged when the whole model goes to device 0 with `force_hooks=True`."""
        model = ModelForTest()
        # The empty key maps the entire model to a single device.
        device_map = {"": 0}

        inputs = torch.randn(2, 3)
        reference = model(inputs)

        dispatch_model(model, device_map, force_hooks=True)
        dispatched_output = model(inputs)
        torch.testing.assert_close(reference, dispatched_output.cpu(), atol=ATOL, rtol=RTOL)

    @require_non_cpu
    def test_load_checkpoint_and_dispatch(self):
        """A checkpoint loaded with a CPU/device map must reproduce the original model's output."""
        model = ModelForTest()
        device_map = {"linear1": "cpu", "batchnorm": "cpu", "linear2": 0}

        inputs = torch.randn(2, 3)
        reference = model(inputs)

        with TemporaryDirectory() as checkpoint_dir:
            checkpoint = os.path.join(checkpoint_dir, "pt_model.bin")
            torch.save(model.state_dict(), checkpoint)

            new_model = ModelForTest()
            new_model = load_checkpoint_and_dispatch(new_model, checkpoint, device_map=device_map)

        # CPU-offloaded weights are on the meta device while waiting for the forward pass.
        meta_device = torch.device("meta")
        accelerator_device = torch.device(torch_device)
        assert new_model.linear1.weight.device == meta_device
        assert new_model.linear2.weight.device == accelerator_device

        loaded_output = new_model(inputs)
        torch.testing.assert_close(reference, loaded_output.cpu(), atol=ATOL, rtol=RTOL)

    def test_load_checkpoint_and_dispatch_device_map_none(self):
        """With `device_map=None`, loaded parameters and buffers must match the saved model's."""
        model = ModelForTest()

        with TemporaryDirectory() as checkpoint_dir:
            checkpoint = os.path.join(checkpoint_dir, "pt_model.bin")
            torch.save(model.state_dict(), checkpoint)

            new_model = ModelForTest()
            new_model = load_checkpoint_and_dispatch(new_model, checkpoint, device_map=None)

        # Walk parameters then buffers of both models in lockstep.
        saved_tensors = itertools.chain(model.named_parameters(), model.named_buffers())
        loaded_tensors = itertools.chain(new_model.named_parameters(), new_model.named_buffers())
        for (name, tensor), (new_name, new_tensor) in zip(saved_tensors, loaded_tensors):
            assert name == new_name
            torch.testing.assert_close(tensor, new_tensor, msg=new_name)

    @require_non_hpu  # hpu does not support device indexing "hpu:1"
    @require_multi_device
    def test_load_checkpoint_and_dispatch_multi_device(self):
        """A checkpoint loaded across CPU and two devices must reproduce the original output."""
        model = BiggerModelForTest()

        device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 1}

        inputs = torch.randn(2, 3)
        reference = model(inputs)

        with TemporaryDirectory() as checkpoint_dir:
            checkpoint = os.path.join(checkpoint_dir, "pt_model.bin")
            torch.save(model.state_dict(), checkpoint)

            new_model = BiggerModelForTest()
            new_model = load_checkpoint_and_dispatch(new_model, checkpoint, device_map=device_map)

        # CPU-offloaded weights are on the meta device while waiting for the forward pass.
        meta_device = torch.device("meta")
        first_device = torch.device(torch_device)
        second_device = torch.device(torch_device.replace(":0", ":1"))
        assert new_model.linear1.weight.device == meta_device
        assert new_model.linear2.weight.device == meta_device
        assert new_model.linear3.weight.device == first_device
        assert new_model.linear4.weight.device == second_device

        loaded_output = new_model(inputs)
        torch.testing.assert_close(reference, loaded_output.cpu(), atol=ATOL, rtol=RTOL)

    @require_non_cpu
    def test_load_checkpoint_and_dispatch_with_unused_submodules(self):
        """Checkpoint loading with `preload_module_classes` must reproduce the original output."""
        model = ModelWithUnusedSubModulesForTest()
        device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 0}

        inputs = torch.randn(2, 3)
        reference = model(inputs)

        with TemporaryDirectory() as checkpoint_dir:
            checkpoint = os.path.join(checkpoint_dir, "pt_model.bin")
            torch.save(model.state_dict(), checkpoint)

            new_model = ModelWithUnusedSubModulesForTest()
            new_model = load_checkpoint_and_dispatch(
                new_model,
                checkpoint,
                device_map=device_map,
                preload_module_classes=["ModuleWithUnusedSubModules"],
            )

        # CPU-offloaded weights are on the meta device while waiting for the forward pass.
        meta_device = torch.device("meta")
        accelerator_device = torch.device(torch_device)
        assert new_model.linear1.linear.weight.device == meta_device
        assert new_model.linear2.linear.weight.device == meta_device
        assert new_model.linear3.linear.weight.device == accelerator_device
        assert new_model.linear4.linear.weight.device == accelerator_device

        loaded_output = new_model(inputs)
        torch.testing.assert_close(reference, loaded_output.cpu(), atol=ATOL, rtol=RTOL)

    @require_non_hpu  # hpu does not support device indexing "hpu:1"
    @require_multi_device
    def test_load_checkpoint_and_dispatch_multi_device_with_unused_submodules(self):
        """Two-device checkpoint loading with `preload_module_classes` must reproduce the output."""
        model = ModelWithUnusedSubModulesForTest()

        device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 1}

        inputs = torch.randn(2, 3)
        reference = model(inputs)

        with TemporaryDirectory() as checkpoint_dir:
            checkpoint = os.path.join(checkpoint_dir, "pt_model.bin")
            torch.save(model.state_dict(), checkpoint)

            new_model = ModelWithUnusedSubModulesForTest()
            new_model = load_checkpoint_and_dispatch(
                new_model,
                checkpoint,
                device_map=device_map,
                preload_module_classes=["ModuleWithUnusedSubModules"],
            )

        # CPU-offloaded weights are on the meta device while waiting for the forward pass.
        meta_device = torch.device("meta")
        first_device = torch.device(torch_device)
        second_device = torch.device(torch_device.replace(":0", ":1"))
        assert new_model.linear1.linear.weight.device == meta_device
        assert new_model.linear2.linear.weight.device == meta_device
        assert new_model.linear3.linear.weight.device == first_device
        assert new_model.linear4.linear.weight.device == second_device

        loaded_output = new_model(inputs)
        torch.testing.assert_close(reference, loaded_output.cpu(), atol=ATOL, rtol=RTOL)

    @require_non_cpu
    def test_cpu_offload_with_hook(self):
        """
        Checks where the weights of modules wrapped by `cpu_offload_with_hook` live: on the CPU after wrapping
        and after `hook.offload()`, on `torch_device` after a forward pass, and back on the CPU for the first
        module once a second module chained through `prev_module_hook` has run.
        """
        cpu = torch.device("cpu")
        accelerator = torch.device(torch_device)

        first, first_hook = cpu_offload_with_hook(torch.nn.Linear(4, 5))
        assert first.weight.device == cpu

        batch = torch.randn(3, 4)
        hidden = first(batch)
        assert hidden.device == accelerator
        assert first.weight.device == accelerator

        first_hook.offload()
        assert first.weight.device == cpu

        second, second_hook = cpu_offload_with_hook(torch.nn.Linear(5, 5), prev_module_hook=first_hook)
        assert second.weight.device == cpu

        hidden = first(batch)
        assert hidden.device == accelerator
        assert first.weight.device == accelerator

        # Running the second module is expected to send the first one back to the CPU.
        hidden = second(hidden)
        assert hidden.device == accelerator
        assert first.weight.device == cpu
        assert second.weight.device == accelerator

        second_hook.offload()
        assert second.weight.device == cpu

    @slow
    @require_bnb
    @require_non_hpu  # bnb is not supported on hpu
    @require_non_torch_xla
    @require_multi_device
    def test_dispatch_model_bnb(self):
        """
        Tests that `dispatch_model` quantizes int8 layers.

        Loads the `bigscience/bloom-560m` checkpoint into an empty, bnb-converted model with
        `device_map="balanced"` and checks that the first block's weights are int8 on device index 0 and the
        last block's weights are int8 on device index 1.
        """
        from huggingface_hub import hf_hub_download
        from transformers import AutoConfig, AutoModel, BitsAndBytesConfig
        from transformers.integrations.bitsandbytes import replace_with_bnb_linear

        with init_empty_weights():
            empty_model = AutoModel.from_config(AutoConfig.from_pretrained("bigscience/bloom-560m"))

        int8_config = BitsAndBytesConfig(load_in_8bit=True)
        quantized = replace_with_bnb_linear(
            empty_model, modules_to_not_convert=["lm_head"], quantization_config=int8_config
        )

        weights_file = hf_hub_download("bigscience/bloom-560m", "pytorch_model.bin")
        quantized = load_checkpoint_and_dispatch(quantized, checkpoint=weights_file, device_map="balanced")

        first_block_weight = quantized.h[0].self_attention.query_key_value.weight
        assert first_block_weight.dtype == torch.int8
        assert first_block_weight.device.index == 0

        last_block_weight = quantized.h[-1].self_attention.query_key_value.weight
        assert last_block_weight.dtype == torch.int8
        assert last_block_weight.device.index == 1

    @require_cuda_or_xpu
    @slow
    @require_bnb
    def test_dispatch_model_int8_simple(self):
        """
        Tests that `dispatch_model` quantizes int8 layers.

        The `bigscience/bloom-560m` checkpoint is loaded three times, each time into a freshly built empty
        bnb-converted model: with `device_map="auto"`, with a map whose value is the device string, and with a
        map whose value is a `torch.device` object. After every load the first block's weights must be int8 and
        live on device index 0.
        """
        from huggingface_hub import hf_hub_download
        from transformers import AutoConfig, AutoModel, BitsAndBytesConfig
        from transformers.integrations.bitsandbytes import replace_with_bnb_linear

        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model_path = hf_hub_download("bigscience/bloom-560m", "pytorch_model.bin")

        device_maps = [
            # test with auto
            "auto",
            # test with str device map
            {"": torch_device},
            # test with torch.device device map: the value must be an actual `torch.device`, not its string form
            {"": torch.device(torch_device)},
        ]

        for device_map in device_maps:
            # Every load starts from its own empty model with the linear layers swapped for bnb ones.
            with init_empty_weights():
                model = AutoModel.from_config(AutoConfig.from_pretrained("bigscience/bloom-560m"))
            model = replace_with_bnb_linear(
                model, modules_to_not_convert=["lm_head"], quantization_config=quantization_config
            )

            model = load_checkpoint_and_dispatch(
                model,
                checkpoint=model_path,
                device_map=device_map,
            )

            weight = model.h[0].self_attention.query_key_value.weight
            assert weight.dtype == torch.int8
            assert weight.device.index == 0

    @require_cuda_or_xpu
    @slow
    @require_bnb
    def test_dipatch_model_fp4_simple(self):
        """
        Tests that `dispatch_model` quantizes fp4 layers.

        The `bigscience/bloom-560m` checkpoint is loaded three times, each time into a freshly built empty
        bnb-converted model: with `device_map="auto"`, with a map whose value is the device string, and with a
        map whose value is a `torch.device` object. After every load the first block's weights must have dtype
        `torch.uint8` and live on device index 0.
        """
        from huggingface_hub import hf_hub_download
        from transformers import AutoConfig, AutoModel, BitsAndBytesConfig
        from transformers.integrations.bitsandbytes import replace_with_bnb_linear

        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        model_path = hf_hub_download("bigscience/bloom-560m", "pytorch_model.bin")

        device_maps = [
            # test with auto
            "auto",
            # test with str device map
            {"": torch_device},
            # test with torch.device device map: the value must be an actual `torch.device`, not its string form
            {"": torch.device(torch_device)},
        ]

        for device_map in device_maps:
            # Every load starts from its own empty model with the linear layers swapped for bnb ones.
            with init_empty_weights():
                model = AutoModel.from_config(AutoConfig.from_pretrained("bigscience/bloom-560m"))
            model = replace_with_bnb_linear(
                model, modules_to_not_convert=["lm_head"], quantization_config=quantization_config
            )

            model = load_checkpoint_and_dispatch(
                model,
                checkpoint=model_path,
                device_map=device_map,
            )

            weight = model.h[0].self_attention.query_key_value.weight
            assert weight.dtype == torch.uint8
            assert weight.device.index == 0


================================================
FILE: tests/test_cli.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
from pathlib import Path
from unittest.mock import patch

import torch
from huggingface_hub.utils import GatedRepoError

import accelerate.commands.test as accelerate_test_cmd
from accelerate.commands.config.config_args import BaseConfig, ClusterConfig, SageMakerConfig, load_config_from_file
from accelerate.commands.estimate import estimate_command, estimate_command_parser, gather_data
from accelerate.commands.launch import _validate_launch_command, launch_command, launch_command_parser
from accelerate.commands.to_fsdp2 import to_fsdp2_command, to_fsdp2_command_parser
from accelerate.commands.tpu import tpu_command_launcher, tpu_command_parser
from accelerate.test_utils.testing import (
    capture_call_output,
    path_in_accelerate_package,
    require_multi_device,
    require_non_hpu,
    require_timm,
    require_transformers,
    run_command,
    run_first,
)
from accelerate.utils import patch_environment
from accelerate.utils.launch import prepare_simple_launcher_cmd_env


class AccelerateLauncherTester(unittest.TestCase):
    """
    Test case for verifying the `accelerate launch` CLI operates correctly.
    If a `default_config.yaml` file is located in the cache it will temporarily move it
    for the duration of the tests.
    """

    test_file_path = path_in_accelerate_package("test_utils", "scripts", "test_cli.py")
    notebook_launcher_path = path_in_accelerate_package("test_utils", "scripts", "test_notebook.py")

    config_folder = Path.home() / ".cache/huggingface/accelerate"
    config_file = "default_config.yaml"
    config_path = config_folder / config_file
    changed_path = config_folder / "_default_config.yaml"

    test_config_path = Path("tests/test_configs")
    parser = launch_command_parser()

    @classmethod
    def setUpClass(cls):
        """Move an existing default config out of the way so it cannot influence the tests."""
        if cls.config_path.is_file():
            cls.config_path.rename(cls.changed_path)

    @classmethod
    def tearDownClass(cls):
        """Put the default config back if `setUpClass` moved it."""
        if cls.changed_path.is_file():
            cls.changed_path.rename(cls.config_path)

    @run_first
    def test_no_config(self):
        """Launch the test script without any config file, adding `--multi_gpu` when several devices exist."""
        args = ["--monitor_interval", "0.1", str(self.test_file_path)]
        if (torch.cuda.is_available() and torch.cuda.device_count() > 1) or (
            torch.xpu.is_available() and torch.xpu.device_count() > 1
        ):
            args = ["--multi_gpu"] + args
        # Parse the list built above; parsing a fresh literal here would silently drop `--multi_gpu`.
        args = self.parser.parse_args(args)
        launch_command(args)

    @run_first
    def test_config_compatibility(self):
        """Launch the test script with every YAML config under `tests/test_configs` that is not excluded."""
        # Any config whose path contains one of these substrings is skipped.
        invalid_configs = ["fp8", "invalid", "mpi", "sagemaker"]
        for config in sorted(self.test_config_path.glob("**/*.yaml")):
            if any(invalid_config in str(config) for invalid_config in invalid_configs):
                continue
            with self.subTest(config_file=config):
                args = self.parser.parse_args(["--config_file", str(config), str(self.test_file_path)])
                launch_command(args)

    @run_first
    def test_invalid_keys(self):
        """Launching with `invalid_keys.yaml` must raise a `ValueError`."""
        config_path = self.test_config_path / "invalid_keys.yaml"
        # `msg` is only reported when no exception is raised; it is not compared with the error text.
        with self.assertRaises(
            ValueError,
            msg="The config file at 'invalid_keys.yaml' had unknown keys ('another_invalid_key', 'invalid_key')",
        ):
            args = self.parser.parse_args(["--config_file", str(config_path), str(self.test_file_path)])
            launch_command(args)

    @run_first
    def test_accelerate_test(self):
        """Run the `accelerate test` command with its default arguments."""
        args = accelerate_test_cmd.test_command_parser().parse_args([])
        accelerate_test_cmd.test_command(args)

    @run_first
    @require_non_hpu
    @require_multi_device
    def test_notebook_launcher(self):
        """
        This test checks a variety of situations and scenarios
        with the `notebook_launcher`
        """
        cmd = ["python", self.notebook_launcher_path]
        with patch_environment(omp_num_threads=1, accelerate_num_processes=2):
            run_command(cmd)

    def test_mpi_multicpu_config_cmd(self):
        """
        Parses a launch command with a test file and the 0_28_0_mpi.yaml config. Tests getting the command and
        environment vars and verifies the mpirun command arg values.
        """
        mpi_config_path = str(self.test_config_path / "0_28_0_mpi.yaml")
        test_file_arg = "--cpu"

        with patch("sys.argv", ["accelerate", str(self.test_file_path), test_file_arg]):
            parser = launch_command_parser()
            args = parser.parse_args()
        args.config_file = mpi_config_path
        args, _, _ = _validate_launch_command(args)

        # Mock out the check for mpirun version to simulate Intel MPI
        with patch("accelerate.utils.launch.which", return_value=True):
            with patch("accelerate.utils.launch.subprocess.check_output", return_value=b"Intel MPI"):
                cmd, _ = prepare_simple_launcher_cmd_env(args)

        # Verify the mpirun command args
        expected_mpirun_cmd = ["mpirun", "-f", "/home/user/hostfile", "-ppn", "4", "-n", "16"]
        self.assertGreater(len(cmd), len(expected_mpirun_cmd))
        generated_mpirun_cmd = cmd[0 : len(expected_mpirun_cmd)]
        self.assertEqual(expected_mpirun_cmd, generated_mpirun_cmd)

        # Verify the python script and args in the mpirun command
        python_script_cmd = cmd[len(expected_mpirun_cmd) :]
        self.assertEqual(len(python_script_cmd), 3)
        self.assertEqual(python_script_cmd[1], str(self.test_file_path))
        self.assertEqual(python_script_cmd[2], test_file_arg)

    def test_validate_launch_command(self):
        """Test that the validation function combines args and defaults."""
        parser = launch_command_parser()
        args = parser.parse_args(
            [
                "--num-processes",
                "2",
                "--deepspeed_config_file",
                "path/to/be/accepted",
                "--config-file",
                str(self.test_config_path / "validate_launch_cmd.yaml"),
                "test.py",
            ]
        )
        # Values straight after parsing, before validation is applied.
        self.assertFalse(args.debug)
        self.assertTrue(args.fsdp_sync_module_states)
        # Validation updates `args` in place, so the same namespace is inspected afterwards.
        _validate_launch_command(args)
        self.assertTrue(args.debug)
        self.assertEqual(2, args.num_processes)
        self.assertFalse(args.fsdp_sync_module_states)
        self.assertEqual("path/to/be/accepted", args.deepspeed_config_file)


class LaunchArgTester(unittest.TestCase):
    """
    Test cases revolving around the CLI wrappers
    """

    parser = launch_command_parser()

    def test_hyphen(self):
        """Hyphenated flag spellings are parsed into the underscore-named attributes."""
        # Try a little from each cluster
        parsed = self.parser.parse_args(["--config-file", "test.yaml", "test.py"])
        assert parsed.config_file == "test.yaml"
        assert parsed.multi_gpu is False

        parsed = self.parser.parse_args(["--multi-gpu", "--num-processes", "4", "test.py"])
        assert parsed.multi_gpu is True
        assert parsed.num_processes == 4

        # And use a mix
        mixed = ["--multi-gpu", "--use-deepspeed", "--use-fsdp", "--num_processes", "4", "test.py"]
        parsed = self.parser.parse_args(mixed)
        assert parsed.multi_gpu is True
        assert parsed.use_deepspeed is True
        assert parsed.use_fsdp is True
        assert parsed.num_processes == 4

    def test_underscore(self):
        """Underscore flag spellings are parsed into the same attributes."""
        # Try a little from each cluster
        parsed = self.parser.parse_args(["--config_file", "test.yaml", "test.py"])
        assert parsed.config_file == "test.yaml"

        parsed = self.parser.parse_args(["--multi_gpu", "--num_processes", "4", "test.py"])
        assert parsed.multi_gpu is True
        assert parsed.num_processes == 4

        # And use a mix
        mixed = ["--multi_gpu", "--use_deepspeed", "--use_fsdp", "--num-processes", "4", "test.py"]
        parsed = self.parser.parse_args(mixed)
        assert parsed.multi_gpu is True
        assert parsed.use_deepspeed is True
        assert parsed.use_fsdp is True
        assert parsed.num_processes == 4

    def test_duplicate_entities(self):
        """The help text must not list the hyphenated spelling of any underscore-named argument."""
        help_text = self.parser.format_help()
        namespace = self.parser.parse_args(["test.py"])
        for dest in vars(namespace):
            if "_" not in dest:
                continue
            hyphenated = f"--{dest.replace('_', '-')}"
            failure = f"Found {hyphenated} in `accelerate launch -h`"
            # Need an exception for `num-processes` since it's in the docstring
            if hyphenated == "--num-processes":
                assert help_text.count(hyphenated) == 1, failure
            else:
                assert hyphenated not in help_text, failure


class ClusterConfigTester(unittest.TestCase):
    """
    Test case for verifying the config dataclasses work
    """

    test_config_path = Path("tests/test_configs")

    def test_base_config(self):
        """`BaseConfig` can be built from the common fields and exposes them as attributes."""
        # Tests that all the dataclasses can be initialized
        fields = {
            "compute_environment": "LOCAL_MACHINE",
            "distributed_type": "NO",
            "mixed_precision": "fp16",
            "debug": False,
            "use_cpu": False,
        }
        base = BaseConfig(**fields)

        assert base.compute_environment == "LOCAL_MACHINE"
        assert base.distributed_type == "NO"
        assert base.mixed_precision == "fp16"
        assert base.debug is False

    def test_cluster_config(self):
        """`ClusterConfig` can be built for a single-process and for a multi-GPU setup."""
        # First normally
        single = ClusterConfig(
            compute_environment="LOCAL_MACHINE",
            distributed_type="NO",
            mixed_precision="fp16",
            num_processes=2,
            debug=False,
            use_cpu=False,
        )

        assert single.compute_environment == "LOCAL_MACHINE"
        assert single.distributed_type == "NO"
        assert single.mixed_precision == "fp16"
        assert single.debug is False

        # Then check with other compute environments
        multi = ClusterConfig(
            compute_environment="LOCAL_MACHINE",
            distributed_type="MULTI_GPU",
            mixed_precision="fp16",
            debug=False,
            num_processes=2,
            enable_cpu_affinity=True,
            use_cpu=False,
        )

        assert multi.distributed_type == "MULTI_GPU"
        assert multi.num_processes == 2
        assert multi.enable_cpu_affinity is True

    def test_sagemaker_config(self):
        """`SageMakerConfig` can be built directly, and the SageMaker sample YAML loads without raising."""
        sagemaker = SageMakerConfig(
            compute_environment="AMAZON_SAGEMAKER",
            distributed_type="NO",
            mixed_precision="fp16",
            debug=False,
            use_cpu=False,
            ec2_instance_type="MY_TYPE",
            iam_role_name="MY_ROLE",
        )

        assert sagemaker.compute_environment == "AMAZON_SAGEMAKER"
        assert sagemaker.ec2_instance_type == "MY_TYPE"
        assert sagemaker.iam_role_name == "MY_ROLE"

        # Only loading is exercised here; the loaded config is not inspected further.
        sample_path = self.test_config_path / "0_30_0_sagemaker.yaml"
        sagemaker = load_config_from_file(str(sample_path))


class TpuConfigTester(unittest.TestCase):
    """
    Test case for verifying the `accelerate tpu-config` CLI passes the right `gcloud` command.
    """

    tpu_name = "test-tpu"
    tpu_zone = "us-central1-a"
    command = "ls"
    cmd = ["accelerate", "tpu-config"]
    base_output = "cd /usr/share"
    command_file = "tests/test_samples/test_command_file.sh"
    gcloud = "Running gcloud compute tpus tpu-vm ssh"

    def setUp(self):
        self.parser = tpu_command_parser()

    def _launch_output(self, cli_args):
        """Parse `cli_args` with the tpu-config parser and return the captured launcher output."""
        parsed = self.parser.parse_args(cli_args)
        return capture_call_output(tpu_command_launcher, parsed)

    def test_base(self):
        """Command, zone and name given purely on the command line."""
        output = self._launch_output(
            ["--command", self.command, "--tpu_zone", self.tpu_zone, "--tpu_name", self.tpu_name, "--debug"]
        )
        expected = f"{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; ls --worker all"
        assert expected in output

    def test_base_backward_compatibility(self):
        """Same as `test_base`, but with the `0_12_0.yaml` config file supplied as well."""
        output = self._launch_output(
            [
                "--config_file",
                "tests/test_configs/0_12_0.yaml",
                "--command",
                self.command,
                "--tpu_zone",
                self.tpu_zone,
                "--tpu_name",
                self.tpu_name,
                "--debug",
            ]
        )
        expected = f"{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; ls --worker all"
        assert expected in output

    def test_with_config_file(self):
        """Only `latest.yaml` is given; the two echo commands are expected in the output."""
        output = self._launch_output(["--config_file", "tests/test_configs/latest.yaml", "--debug"])
        expected = f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; echo "hello world"; echo "this is a second command" --worker all'
        assert expected in output

    def test_with_config_file_and_command(self):
        """`latest.yaml` plus an explicit `--command`; only that command is expected in the output."""
        output = self._launch_output(
            ["--config_file", "tests/test_configs/latest.yaml", "--command", self.command, "--debug"]
        )
        expected = f"{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; ls --worker all"
        assert expected in output

    def test_with_config_file_and_multiple_command(self):
        """Repeated `--command` flags are joined with `; ` in the order given."""
        output = self._launch_output(
            [
                "--config_file",
                "tests/test_configs/latest.yaml",
                "--command",
                self.command,
                "--command",
                'echo "Hello World"',
                "--debug",
            ]
        )
        expected = f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; ls; echo "Hello World" --worker all'
        assert expected in output

    def test_with_config_file_and_command_file(self):
        """`latest.yaml` plus `--command_file`; the two echo commands are expected in the output."""
        output = self._launch_output(
            ["--config_file", "tests/test_configs/latest.yaml", "--command_file", self.command_file, "--debug"]
        )
        expected = f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; echo "hello world"; echo "this is a second command" --worker all'
        assert expected in output

    def test_with_config_file_and_command_file_backward_compatibility(self):
        """`0_12_0.yaml` plus `--command_file`, with zone and name given on the command line."""
        output = self._launch_output(
            [
                "--config_file",
                "tests/test_configs/0_12_0.yaml",
                "--command_file",
                self.command_file,
                "--tpu_zone",
                self.tpu_zone,
                "--tpu_name",
                self.tpu_name,
                "--debug",
            ]
        )
        expected = f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; echo "hello world"; echo "this is a second command" --worker all'
        assert expected in output

    def test_accelerate_install(self):
        """`--install_accelerate` puts `pip install accelerate -U` ahead of the echo commands."""
        output = self._launch_output(
            ["--config_file", "tests/test_configs/latest.yaml", "--install_accelerate", "--debug"]
        )
        expected = f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; pip install accelerate -U; echo "hello world"; echo "this is a second command" --worker all'
        assert expected in output

    def test_accelerate_install_version(self):
        """`--accelerate_version 12.0.0` turns the install step into `pip install accelerate==12.0.0`."""
        output = self._launch_output(
            [
                "--config_file",
                "tests/test_configs/latest.yaml",
                "--install_accelerate",
                "--accelerate_version",
                "12.0.0",
                "--debug",
            ]
        )
        expected = f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; pip install accelerate==12.0.0; echo "hello world"; echo "this is a second command" --worker all'
        assert expected in output


class ModelEstimatorTester(unittest.TestCase):
    """
    Test case for checking the output of `accelerate estimate-memory` is correct.

    - Uses `estimate_command` when trying to catch raised errors
    - Uses `gather_data` when just verifying the calculations are correct
    """

    parser = estimate_command_parser()

    def test_invalid_model_name(self):
        """A model name that does not exist must raise `OSError`."""
        with self.assertRaises(OSError, msg="Repo for model `somebrokenname` does not exist on the Hub"):
            parsed = self.parser.parse_args(["somebrokenname"])
            estimate_command(parsed)

    @require_timm
    def test_invalid_model_name_timm(self):
        """Asking for `muellerzr/dummy` through `timm` must raise `RuntimeError`."""
        with self.assertRaises(RuntimeError, msg="Tried to load `muellerzr/dummy` with `timm` but"):
            parsed = self.parser.parse_args(["muellerzr/dummy", "--library_name", "timm"])
            estimate_command(parsed)

    @require_transformers
    def test_invalid_model_name_transformers(self):
        """Asking for `muellerzr/dummy` through `transformers` must raise `RuntimeError`."""
        with self.assertRaises(RuntimeError, msg="Tried to load `muellerzr/dummy` with `transformers` but"):
            parsed = self.parser.parse_args(["muellerzr/dummy", "--library_name", "transformers"])
            estimate_command(parsed)

    def test_no_metadata(self):
        """Estimating `muellerzr/dummy` without naming a library must raise `ValueError`."""
        with self.assertRaises(
            ValueError, msg="Model `muellerzr/dummy` does not have any library metadata on the Hub"
        ):
            parsed = self.parser.parse_args(["muellerzr/dummy"])
            estimate_command(parsed)

    def test_gated(self):
        """Estimating `meta-llama/Llama-2-7b-hf` must raise `GatedRepoError` or `EnvironmentError`."""
        with self.assertRaises(
            (GatedRepoError, EnvironmentError),
            msg="Repo for model `meta-llama/Llama-2-7b-hf` is gated or environment error occurred",
        ):
            parsed = self.parser.parse_args(["meta-llama/Llama-2-7b-hf"])
            with patch_environment(hf_hub_disable_implicit_token="1"):
                estimate_command(parsed)

    @require_transformers
    def test_remote_code(self):
        """The dynamic test model raises `ValueError` without `--trust_remote_code` and loads with it."""
        # Also tests that custom `Auto` classes work
        repo = "hf-internal-testing/test_dynamic_model"
        without_flag = self.parser.parse_args([repo])
        with self.assertRaises(ValueError, msg="--trust_remote_code"):
            gather_data(without_flag)

        # Verify it works with the flag
        with_flag = self.parser.parse_args([repo, "--trust_remote_code"])
        gather_data(with_flag)

    @require_transformers
    def test_explicit_dtypes(self):
        """Requesting float32 and float16 yields one row per dtype, the float16 sizes being half of float32."""
        parsed = self.parser.parse_args(["bert-base-cased", "--dtypes", "float32", "float16"])
        output = gather_data(parsed)
        # The largest layer and total size of the model in bytes
        largest_layer, total_size = 90669056, 433249280
        # One row is expected for each of the two requested dtypes
        assert len(output) == 2, f"Output was missing a precision, expected 2 but received {len(output)}"

        # A divisor of 1 gives the float32 figures, a divisor of 2 the float16 ones.
        for index, divisor in enumerate([1, 2]):
            bits = 32 // divisor
            dtype_name = f"float{bits}"
            expected_largest = largest_layer / divisor
            expected_total = total_size / divisor
            expected_training = expected_total * 4
            row = output[index]

            assert dtype_name == row[0], f"Output is missing precision `{dtype_name}`"
            assert expected_largest == row[1], (
                f"Calculation for largest layer size in `{dtype_name}` is incorrect."
            )

            assert expected_total == row[2], f"Calculation for total size in `{dtype_name}` is incorrect."
            assert expected_training == max(row[3].values()), (
                f"Calculation for total training size in `{dtype_name}` is incorrect."
            )

    @require_transformers
    def test_transformers_model(self):
        """Checks the float32 largest-layer and total sizes reported for `bert-base-cased`."""
        parsed = self.parser.parse_args(["bert-base-cased", "--dtypes", "float32"])
        output = gather_data(parsed)
        # The largest layer and total size of the model in bytes
        largest_layer, total_size = 90669056, 433249280
        assert largest_layer == output[0][1], (
            f"Calculation for largest layer size in `fp32` is incorrect, expected {largest_layer} but received {output[0][1]}"
        )
        assert total_size == output[0][2], (
            f"Calculation for total size in `fp32` is incorrect, expected {total_size} but received {output[0][2]}"
        )

    @require_transformers
    def test_no_split_modules(self):
        """The largest-layer figure for idefics-80b-instruct must account for its `no_split` modules."""
        # idefics-80b-instruct has ["IdeficsDecoderLayer", "IdeficsGatedCrossAttentionLayer"]
        parsed = self.parser.parse_args(["HuggingFaceM4/idefics-80b-instruct", "--dtypes", "float32"])
        output = gather_data(parsed)
        reported_largest = output[0][1]
        # without factoring in `no_split` modules, the largest layer is 721420288 bytes
        assert reported_largest != 721420288, (
            "Largest layer calculation incorrect, did not factor in `no_split` modules."
        )
        # the real answer is 3240165632 bytes
        assert reported_largest == 3240165632

    @require_timm
    def test_timm_model(self):
        """Checks the largest-layer and total sizes reported for `timm/resnet50.a1_in1k`."""
        parsed = self.parser.parse_args(["timm/resnet50.a1_in1k", "--library_name", "timm"])
        output = gather_data(parsed)
        # The largest layer and total size of the model in bytes
        largest_layer, total_size = 9437184, 102441032
        assert largest_layer == output[0][1], (
            f"Calculation for largest layer size in `fp32` is incorrect, expected {largest_layer} but received {output[0][1]}"
        )
        assert total_size == output[0][2], (
            f"Calculation for total size in `fp32` is incorrect, expected {total_size} but received {output[0][2]}"
        )


class ToFSDP2Tester(unittest.TestCase):
    """
    Test case for verifying the `accelerate to-fsdp2` CLI outputs.

    The tests read `tests/test_configs/latest_fsdp.yaml`; `test_fsdp2_overwrite` rewrites that file in place, so the
    class snapshots it in `setUpClass` and writes the snapshot back in `tearDownClass`.
    """

    parser = to_fsdp2_command_parser()
    test_config_path = Path("tests/test_configs")
    # Snapshot of `latest_fsdp.yaml` taken in `setUpClass`. Defaults to `None` so that `tearDownClass` does not
    # fail with an `AttributeError` when the file was missing and no snapshot could be taken.
    original_config = None

    @classmethod
    def setUpClass(cls):
        """Snapshot `latest_fsdp.yaml` (when present) so it can be restored after the tests."""
        if (cls.test_config_path / "latest_fsdp.yaml").exists():
            cls.original_config = load_config_from_file(str(cls.test_config_path / "latest_fsdp.yaml"))

    @classmethod
    def tearDownClass(cls):
        """Write the snapshot taken in `setUpClass` back to `latest_fsdp.yaml`, if there is one."""
        if cls.original_config is not None:
            cls.original_config.to_yaml_file(str(cls.test_config_path / "latest_fsdp.yaml"))

    def tearDown(self):
        """Remove the `output.yaml` file some tests write next to the test configs."""
        if (self.test_config_path / "output.yaml").exists():
            (self.test_config_path / "output.yaml").unlink()

    def test_nonexistent_config_file(self):
        """A config path that does not exist must raise `FileNotFoundError`."""
        with self.assertRaises(FileNotFoundError, msg="Config file `nonexistent.yaml` not found"):
            args = self.parser.parse_args(["--config_file", "nonexistent.yaml"])
            to_fsdp2_command(args)

    def test_no_output_without_overwrite(self):
        """Passing neither `--overwrite` nor `--output_file` must raise `ValueError`."""
        with self.assertRaises(ValueError, msg="If --overwrite is not set, --output_file must be provided"):
            args = self.parser.parse_args(["--config_file", str(self.test_config_path / "latest_fsdp.yaml")])
            to_fsdp2_command(args)

    @patch("pathlib.Path.exists")
    def test_overwrite_when_output_file_exists(self, mock_exists):
        """An already existing output file without `--overwrite` must raise `FileExistsError`."""
        # NOTE(review): the patched `exists` is a plain mock that is called without the `Path` instance, so
        # `_mock_self` here looks like an auto-created child mock and this side effect appears to report every
        # path as existing. That is enough for this test, but confirm it is the intent.
        mock_exists.side_effect = (
            lambda: str(mock_exists._mock_self) == "output.yaml" or mock_exists._mock_self.exists()
        )

        with self.assertRaises(
            FileExistsError, msg="Output file `output.yaml` already exists and --overwrite is not set"
        ):
            args = self.parser.parse_args(
                ["--config_file", str(self.test_config_path / "latest_fsdp.yaml"), "--output_file", "output.yaml"]
            )
            to_fsdp2_command(args)

    def test_fsdp2_config(self):
        """Converting into a separate output file yields a `ClusterConfig` with `fsdp_version == 2`."""
        args = self.parser.parse_args(
            [
                "--config_file",
                str(self.test_config_path / "latest_fsdp.yaml"),
                "--output_file",
                str(self.test_config_path / "output.yaml"),
            ]
        )
        to_fsdp2_command(args)

        config = load_config_from_file(str(self.test_config_path / "output.yaml"))
        assert isinstance(config, ClusterConfig)
        assert config.fsdp_config["fsdp_version"] == 2

    def test_config_already_fsdp2(self):
        """A config that already declares `fsdp_version: 2` only triggers a warning."""
        args = self.parser.parse_args(
            [
                "--config_file",
                str(self.test_config_path / "latest_fsdp.yaml"),
                "--output_file",
                str(self.test_config_path / "output.yaml"),
            ]
        )

        mock_config = {"fsdp_config": {"fsdp_version": 2}}

        with patch("accelerate.commands.to_fsdp2.load_config", return_value=mock_config):
            with self.assertLogs(level="WARNING") as cm:
                to_fsdp2_command(args)

            assert "Config already specifies FSDP2, skipping conversion..." in cm.output[0]

    # Has to be the last test because it overwrites the config file
    def test_fsdp2_overwrite(self):
        """With `--overwrite`, the input file itself is rewritten with `fsdp_version == 2`."""
        args = self.parser.parse_args(
            [
                "--config_file",
                str(self.test_config_path / "latest_fsdp.yaml"),
                "--overwrite",
            ]
        )
        to_fsdp2_command(args)

        config = load_config_from_file(str(self.test_config_path / "latest_fsdp.yaml"))
        assert isinstance(config, ClusterConfig)
        assert config.fsdp_config["fsdp_version"] == 2


================================================
FILE: tests/test_compile.py
================================================
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from unittest import skip

import torch
from torch.utils.benchmark import Timer

from accelerate.test_utils import require_huggingface_suite, require_non_cpu, require_non_hpu, slow, torch_device
from accelerate.utils import compile_regions, extract_model_from_parallel, release_memory


# Checkpoint whose config is used to build the model under test.
MODEL_ID = "gpt2"

# Number of timed runs for the cold-start and the inference measurements respectively.
COMPILE_ITERS = 2
INFERENCE_ITERS = 100

# Statements handed to `torch.utils.benchmark.Timer`; the compile variant resets the dynamo and
# inductor caches before running the same forward call.
INFRENCE_STMT = "model(input_ids, use_cache=False)"
COMPILE_STMT = f"torch._dynamo.reset(); torch._inductor.utils.clear_inductor_caches(); {INFRENCE_STMT}"

# Compile backend: "hpu_backend" on HPU devices, "inductor" everywhere else.
backend = "hpu_backend" if torch_device == "hpu" else "inductor"


@require_huggingface_suite
@skip("Don't work with torch 2.8")
class RegionalCompilationTester(unittest.TestCase):
    """Tests for `compile_regions`: wrapping structure, unwrapping, and timing against `torch.compile`."""

    @staticmethod
    def _median_runtime(stmt, model, input_ids, iterations):
        """Run `stmt` `iterations` times with `model` and `input_ids` in scope and return the median time."""
        timer = Timer(stmt=stmt, globals={"model": model, "input_ids": input_ids})
        return timer.timeit(iterations).median

    def _get_model_and_inputs(self):
        """Build a model from the `MODEL_ID` config plus a (4, 128) batch of random token ids on `torch_device`."""
        from transformers import AutoConfig, AutoModelForCausalLM

        with torch.device(torch_device):
            model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(MODEL_ID))
            input_ids = torch.randint(0, 1000, (4, 128), dtype=torch.int64)

        return model, input_ids

    def test_regions_are_compiled(self):
        """The wrapper keeps the original model and compiles the first block and `lm_head` separately."""
        model, _ = self._get_model_and_inputs()
        compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)
        optimized_module_cls = torch._dynamo.eval_frame.OptimizedModule
        first_block = compiled_model.transformer.h[0]
        head = compiled_model.lm_head

        # The compiled model must keep a reference to the original model
        assert hasattr(compiled_model, "_orig_mod")
        assert compiled_model._orig_mod is model

        # The first transformer block and the lm_head must each be compiled on their own
        assert isinstance(first_block, optimized_module_cls)
        assert isinstance(head, optimized_module_cls)
        assert first_block._orig_mod is model.transformer.h[0]
        assert head._orig_mod is model.lm_head

    def test_extract_model_keep_torch_compile(self):
        """Unwrapping with `keep_torch_compile=True` returns a wrapper around the same original model."""
        model, _ = self._get_model_and_inputs()
        reference = compile_regions(model, mode="reduce-overhead", backend=backend)

        parallel_model = torch.nn.parallel.DataParallel(model)
        parallel_compiled = compile_regions(parallel_model, mode="reduce-overhead", backend=backend)
        unwrapped = extract_model_from_parallel(parallel_compiled, keep_torch_compile=True)

        assert reference._orig_mod is unwrapped._orig_mod

    def test_extract_model_remove_torch_compile(self):
        """Unwrapping with `keep_torch_compile=False` returns the original model itself."""
        model, _ = self._get_model_and_inputs()
        reference = compile_regions(model, mode="reduce-overhead", backend=backend)

        parallel_model = torch.nn.parallel.DataParallel(model)
        parallel_compiled = compile_regions(parallel_model, mode="reduce-overhead", backend=backend)
        unwrapped = extract_model_from_parallel(parallel_compiled, keep_torch_compile=False)

        assert reference._orig_mod is unwrapped

    @require_non_cpu
    @require_huggingface_suite
    def test_regional_compilation_cold_start(self):
        """Regional compilation must have a lower median cold-start time than full compilation."""
        model, input_ids = self._get_model_and_inputs()

        regional_model = compile_regions(model, backend=backend)
        regional_cold_start = self._median_runtime(COMPILE_STMT, regional_model, input_ids, COMPILE_ITERS)

        full_model = torch.compile(model, backend=backend)
        full_cold_start = self._median_runtime(COMPILE_STMT, full_model, input_ids, COMPILE_ITERS)

        self.assertLess(
            regional_cold_start,
            full_cold_start,
            "Regional compilation should have a faster cold start than full compilation",
        )

        release_memory(model, full_model, regional_model)

    @slow
    @require_non_hpu
    @require_non_cpu
    @require_huggingface_suite
    def test_regional_compilation_inference_speedup(self):
        """Regional and full compilation must give speedups over eager that differ by at most 0.1."""
        model, input_ids = self._get_model_and_inputs()

        eager_latency = self._median_runtime(INFRENCE_STMT, model, input_ids, INFERENCE_ITERS)

        regional_model = compile_regions(model, backend=backend)
        regional_latency = self._median_runtime(INFRENCE_STMT, regional_model, input_ids, INFERENCE_ITERS)

        full_model = torch.compile(model, backend=backend)
        full_latency = self._median_runtime(INFRENCE_STMT, full_model, input_ids, INFERENCE_ITERS)

        # Speedups are expressed relative to the uncompiled baseline
        full_speedup = eager_latency / full_latency
        regional_speedup = eager_latency / regional_latency

        self.assertAlmostEqual(
            regional_speedup,
            full_speedup,
            delta=0.1,
            msg="Regional compilation should have a similar speedup to full compilation",
        )

        release_memory(model, full_model, regional_model)


================================================
FILE: tests/test_configs/0_11_0.yaml
================================================
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: 'NO'
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 1
use_cpu: false

================================================
FILE: tests/test_configs/0_12_0.yaml
================================================
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: 'NO'
downcast_bf16: 'no'
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 1
use_cpu: false

================================================
FILE: tests/test_configs/0_28_0_mpi.yaml
================================================
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_CPU
downcast_bf16: 'no'
machine_rank: 0
main_process_ip: 127.0.0.1
main_process_port: 29500
main_training_function: main
mixed_precision: 'no'
mpirun_config:
  mpirun_hostfile: /home/user/hostfile
num_machines: 4
num_processes: 16
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: true


================================================
FILE: tests/test_configs/0_30_0_sagemaker.yaml
================================================
compute_environment: AMAZON_SAGEMAKER
debug: false
distributed_type: NO
mixed_precision: fp16
debug: false
use_cpu: false
ec2_instance_type: MY_TYPE
iam_role_name: MY_ROLE


================================================
FILE: tests/test_configs/0_34_0_fp8.yaml
================================================
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
enable_cpu_affinity: false
fp8_config:
  amax_compute_algo: max
  amax_history_len: 1024
  backend: TE
  fp8_format: E4M3
  interval: 1
  margin: 0
  override_linear_precision: (false, false, false)
  use_autocast_during_eval: false
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: fp8
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false


================================================
FILE: tests/test_configs/README.md
================================================
This folder contains test configs for `accelerate config`. A new config should be generated for each major version;
they are written by running `accelerate config` and selecting the "No distributed training" option.

================================================
FILE: tests/test_configs/invalid_keys.yaml
================================================
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: 'NO'
downcast_bf16: 'no'
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 1
use_cpu: false
invalid_key: "invalid_value"
another_invalid_key: "another_invalid_value"

================================================
FILE: tests/test_configs/latest.yaml
================================================
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: 'NO'
downcast_bf16: 'no'
fsdp_config: {}
gpu_ids: all
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
megatron_lm_config: {}
mixed_precision: 'no'
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
use_cpu: false
tpu_name: 'test-tpu'
tpu_zone: 'us-central1-a'
commands: null
command_file: tests/test_samples/test_command_file.sh

================================================
FILE: tests/test_configs/latest_fsdp.yaml
================================================
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
enable_cpu_affinity: false
fsdp_config:
  fsdp_activation_checkpointing: false
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: true
  fsdp_forward_prefetch: false
  fsdp_ignored_modules: null
  fsdp_offload_params: false
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_transformer_layer_cls_to_wrap: BertLayer
  fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false


================================================
FILE: tests/test_configs/validate_launch_cmd.yaml
================================================
compute_environment: LOCAL_MACHINE
debug: true
num_processes: 1
distributed_type: 'NO'
fsdp_config:
  fsdp_sync_module_states: false
deepspeed_config:
  deepspeed_config_file: path/to/be/ignored


================================================
FILE: tests/test_cpu.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from accelerate import debug_launcher
from accelerate.test_utils import require_cpu, test_ops, test_script


@require_cpu
class MultiCPUTester(unittest.TestCase):
    """Runs the `test_script` and `test_ops` entry points through `debug_launcher`."""

    def test_cpu(self):
        """Launch `test_script.main` with `debug_launcher`."""
        debug_launcher(test_script.main)

    def test_ops(self):
        """Launch `test_ops.main` with `debug_launcher`."""
        debug_launcher(test_ops.main)


================================================
FILE: tests/test_data_loader.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
import weakref

import pytest
import torch
from parameterized import parameterized
from torch.utils.data import BatchSampler, DataLoader, IterableDataset

from accelerate import Accelerator, PartialState
from accelerate.data_loader import (
    BatchSamplerShard,
    DataLoaderDispatcher,
    DataLoaderShard,
    DataLoaderStateMixin,
    IterableDatasetShard,
    SkipBatchSampler,
    SkipDataLoader,
    prepare_data_loader,
    skip_first_batches,
)
from accelerate.state import GradientState
from accelerate.test_utils.testing import AccelerateTestCase, require_datasets, require_torchdata_stateful_dataloader
from accelerate.utils import is_torchdata_stateful_dataloader_available, set_seed


if is_torchdata_stateful_dataloader_available():
    from torchdata.stateful_dataloader import (
        StatefulDataLoader,
    )


def parameterized_custom_name_func(func, param_num, param):
    """
    Name generator for `parameterized` sub-tests.

    Returns `<func.__name__>_num_workers_<first positional parameter>`. `param_num` is part of the expected
    signature but is not used.
    """
    worker_count = param.args[0]
    return f"{func.__name__}_num_workers_{worker_count}"


class RandomIterableDataset(IterableDataset):
    """
    Iterable dataset of random length, for testing.

    Yields 0, 1, 2, ... and, after each item, stops with probability `p_stop`; never yields more than `max_length`
    items.
    """

    def __init__(self, p_stop=0.01, max_length=1000):
        self.max_length = max_length
        self.p_stop = p_stop

    def __iter__(self):
        emitted = 0
        while emitted < self.max_length:
            yield emitted
            emitted += 1
            # One draw from the global `random` state per yielded item decides whether to stop.
            if random.random() < self.p_stop:
                break


class SimpleIterableDataset(IterableDataset):
    """Iterable dataset with a known length that yields `num_samples` tensors produced by `torch.rand(1)`."""

    def __init__(self, num_samples=1000):
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    def __iter__(self):
        yield from (torch.rand(1) for _ in range(self.num_samples))

    def set_epoch(self, epoch):
        """Store `epoch` on the dataset."""
        self.epoch = epoch


class SimpleBatchSampler(BatchSampler):
    """`BatchSampler` that re-seeds `generator` with `seed + epoch` each time iteration starts."""

    def __init__(self, sampler, batch_size, drop_last, generator, seed):
        super().__init__(sampler, batch_size=batch_size, drop_last=drop_last)
        self.epoch = 0
        self.seed = seed
        self.generator = generator

    def set_epoch(self, epoch):
        """Store the epoch that is added to the seed on the next iteration."""
        self.epoch = epoch

    def __iter__(self):
        # Seeding happens when the iterator is requested, not when the first batch is pulled.
        epoch_seed = self.seed + self.epoch
        self.generator.manual_seed(epoch_seed)
        return super().__iter__()


class DataLoaderTester(AccelerateTestCase):
    def check_batch_sampler_shards(self, batch_sampler, expected, split_batches=False, even_batches=True):
        """
        Shard `batch_sampler` across two processes and compare what each shard yields with `expected`.

        `expected` holds one list of batches per process. The reported shard lengths are only compared when
        `split_batches` is false.
        """
        shards = []
        for process_index in range(2):
            shards.append(
                BatchSamplerShard(
                    batch_sampler, 2, process_index, split_batches=split_batches, even_batches=even_batches
                )
            )
        produced = [list(shard) for shard in shards]
        if not split_batches:
            reported_lengths = [len(shard) for shard in shards]
            expected_lengths = [len(batches) for batches in expected]
            assert reported_lengths == expected_lengths
        assert produced == expected

    def test_batch_sampler_shards_with_no_splits(self):
        """Shards of a batch-size-3 sampler over two processes, with the default `even_batches=True`."""
        # Batches that process 0 and process 1 receive in every scenario with at least 18 samples.
        head_0 = [[0, 1, 2], [6, 7, 8], [12, 13, 14]]
        head_1 = [[3, 4, 5], [9, 10, 11], [15, 16, 17]]
        heads_only = [head_0, head_1]
        all_24 = [head_0 + [[18, 19, 20]], head_1 + [[21, 22, 23]]]

        # Each scenario is (dataset length, drop_last, expected shards); they are checked in order.
        scenarios = [
            # The dataset is a round multiple of the total batch size: `drop_last` changes nothing.
            (24, False, all_24),
            (24, True, all_24),
            # The dataset is a round multiple of the batch size but not of the total batch size.
            (21, False, [head_0 + [[18, 19, 20]], head_1 + [[0, 1, 2]]]),
            (21, True, heads_only),
            # The dataset is not a round multiple of the batch size but has a multiple of num_processes batches.
            (22, False, [head_0 + [[18, 19, 20]], head_1 + [[21, 0, 1]]]),
            (22, True, heads_only),
            # The dataset is not a round multiple of the batch size and does not have a multiple of
            # num_processes batches.
            (20, False, [head_0 + [[18, 19, 0]], head_1 + [[1, 2, 3]]]),
            (20, True, heads_only),
            # The dataset is very small.
            (2, False, [[[0, 1, 0]], [[1, 0, 1]]]),
            (2, True, [[], []]),
        ]
        for num_samples, drop_last, expected in scenarios:
            batch_sampler = BatchSampler(range(num_samples), batch_size=3, drop_last=drop_last)
            self.check_batch_sampler_shards(batch_sampler, expected)

    def test_batch_sampler_shards_with_splits(self):
        """Shards of a batch-size-4 sampler whose batches are split across two processes (`even_batches=True`)."""
        # Half-batches that process 0 and process 1 receive in every scenario with at least 20 samples.
        head_0 = [[0, 1], [4, 5], [8, 9], [12, 13], [16, 17]]
        head_1 = [[2, 3], [6, 7], [10, 11], [14, 15], [18, 19]]
        heads_only = [head_0, head_1]
        all_24 = [head_0 + [[20, 21]], head_1 + [[22, 23]]]

        # Each scenario is (dataset length, drop_last, expected shards); they are checked in order.
        scenarios = [
            # The dataset is a round multiple of the batch size: `drop_last` changes nothing.
            (24, False, all_24),
            (24, True, all_24),
            # The dataset is not a round multiple of the batch size.
            (22, False, [head_0 + [[20, 21]], head_1 + [[0, 1]]]),
            (22, True, heads_only),
            # The dataset is not a round multiple of the batch size or of num_processes.
            (21, False, [head_0 + [[20, 0]], head_1 + [[1, 2]]]),
            (21, True, heads_only),
            # The dataset is very small.
            (2, False, [[[0, 1]], [[0, 1]]]),
            (2, True, [[], []]),
        ]
        for num_samples, drop_last, expected in scenarios:
            batch_sampler = BatchSampler(range(num_samples), batch_size=4, drop_last=drop_last)
            self.check_batch_sampler_shards(batch_sampler, expected, split_batches=True)

    def test_batch_sampler_shards_with_no_splits_no_even(self):
        """Shards of a batch-size-3 sampler over two processes with `even_batches=False`."""
        # Batches that process 0 and process 1 receive in every scenario with at least 18 samples.
        head_0 = [[0, 1, 2], [6, 7, 8], [12, 13, 14]]
        head_1 = [[3, 4, 5], [9, 10, 11], [15, 16, 17]]
        heads_only = [head_0, head_1]
        all_24 = [head_0 + [[18, 19, 20]], head_1 + [[21, 22, 23]]]

        # Each scenario is (dataset length, drop_last, expected shards); they are checked in order.
        scenarios = [
            # The dataset is a round multiple of the total batch size: `drop_last` changes nothing.
            (24, False, all_24),
            (24, True, all_24),
            # The dataset is a round multiple of the batch size but not of the total batch size.
            (21, False, [head_0 + [[18, 19, 20]], head_1]),
            (21, True, heads_only),
            # The dataset is not a round multiple of the batch size but has a multiple of num_processes batches.
            (22, False, [head_0 + [[18, 19, 20]], head_1 + [[21]]]),
            (22, True, heads_only),
            # The dataset is not a round multiple of the batch size and does not have a multiple of
            # num_processes batches.
            (20, False, [head_0 + [[18, 19]], head_1]),
            (20, True, heads_only),
            # The dataset is very small.
            (2, False, [[[0, 1]], []]),
            (2, True, [[], []]),
        ]
        for num_samples, drop_last, expected in scenarios:
            batch_sampler = BatchSampler(range(num_samples), batch_size=3, drop_last=drop_last)
            self.check_batch_sampler_shards(batch_sampler, expected, even_batches=False)

    def test_batch_sampler_shards_with_splits_no_even(self):
        """Shards of a batch-size-4 sampler split across two processes with `even_batches=False`."""
        # Half-batches that process 0 and process 1 receive in every scenario with at least 20 samples.
        head_0 = [[0, 1], [4, 5], [8, 9], [12, 13], [16, 17]]
        head_1 = [[2, 3], [6, 7], [10, 11], [14, 15], [18, 19]]
        heads_only = [head_0, head_1]
        all_24 = [head_0 + [[20, 21]], head_1 + [[22, 23]]]

        # Each scenario is (dataset length, drop_last, expected shards); they are checked in order.
        scenarios = [
            # The dataset is a round multiple of the batch size: `drop_last` changes nothing.
            (24, False, all_24),
            (24, True, all_24),
            # The dataset is not a round multiple of the batch size.
            (22, False, [head_0 + [[20, 21]], head_1]),
            (22, True, heads_only),
            # The dataset is not a round multiple of the batch size or of num_processes.
            (21, False, [head_0 + [[20]], head_1]),
            (21, True, heads_only),
            # The dataset is very small.
            (2, False, [[[0, 1]], []]),
            (2, True, [[], []]),
        ]
        for num_samples, drop_last, expected in scenarios:
            batch_sampler = BatchSampler(range(num_samples), batch_size=4, drop_last=drop_last)
            self.check_batch_sampler_shards(batch_sampler, expected, split_batches=True, even_batches=False)

    def test_batch_sampler_with_varying_batch_size(self):
        """With `even_batches=False`, batches of different sizes are dealt out alternately to the two shards."""
        batches = [[0, 1, 2], [3, 4], [5, 6, 7, 8], [9, 10, 11], [12, 13]]
        first_shard, second_shard = (
            BatchSamplerShard(batches, 2, process_index, even_batches=False) for process_index in range(2)
        )

        assert len(first_shard) == 3
        assert len(second_shard) == 2

        assert list(first_shard) == [[0, 1, 2], [5, 6, 7, 8], [12, 13]]
        assert list(second_shard) == [[3, 4], [9, 10, 11]]

    def check_iterable_dataset_shards(
        self, dataset, seed, batch_size, drop_last=False, num_processes=2, split_batches=False
    ):
        """
        Shard `dataset` with one `IterableDatasetShard` per process and check the shards against the dataset itself.

        Every shard must yield the same number of items, a multiple of the per-shard batch size, and interleaving
        the shards batch by batch must reproduce the items of the dataset in order (cycling back to the start of
        the dataset when `drop_last=False` makes the shards yield more items than the dataset holds).

        Args:
            dataset: Iterable dataset to shard; it is re-seeded with `seed` before each full iteration.
            seed: Seed passed to `random.seed` so that every iteration of `dataset` is reproducible.
            batch_size: Batch size given to each `IterableDatasetShard`.
            drop_last: Forwarded to `IterableDatasetShard`.
            num_processes: Number of shards to create.
            split_batches: Forwarded to `IterableDatasetShard`; when set, each shard is expected to take
                `batch_size // num_processes` items per batch.
        """
        random.seed(seed)
        reference = list(dataset)

        # Build every shard first, then iterate over them one after the other.
        shards = []
        for rank in range(num_processes):
            shards.append(
                IterableDatasetShard(
                    dataset,
                    batch_size=batch_size,
                    drop_last=drop_last,
                    num_processes=num_processes,
                    process_index=rank,
                    split_batches=split_batches,
                )
            )

        items_per_shard = []
        for shard in shards:
            # The dataset is random, so reset the seed before each pass to observe the same underlying items.
            random.seed(seed)
            items_per_shard.append(list(shard))

        if split_batches:
            shard_batch_size = batch_size // num_processes
        else:
            shard_batch_size = batch_size

        # All shards should yield the same number of items, a round multiple of shard_batch_size.
        expected_length = len(items_per_shard[0])
        for shard_items in items_per_shard[1:]:
            assert len(shard_items) == expected_length
            assert (len(shard_items) % shard_batch_size) == 0

        # Rebuild the global stream: one batch from each shard in turn.
        observed = []
        for start in range(0, expected_length, shard_batch_size):
            for shard_items in items_per_shard:
                observed.extend(shard_items[start : start + shard_batch_size])

        if not drop_last:
            # The shards may have looped over the dataset to complete batches; extend the reference accordingly.
            while len(reference) < len(observed):
                reference = reference * 2
        assert observed == reference[: len(observed)]

    def test_iterable_dataset_shard(self):
        """Run `check_iterable_dataset_shards` for every `drop_last`/`split_batches` combination on two datasets."""
        seed = 42

        # Default dataset first, then the edge case with a very small dataset.
        for dataset_kwargs in ({}, {"max_length": 2}):
            dataset = RandomIterableDataset(**dataset_kwargs)
            for split_batches in (False, True):
                for drop_last in (False, True):
                    self.check_iterable_dataset_shards(
                        dataset, seed, batch_size=4, drop_last=drop_last, split_batches=split_batches
                    )

    def test_iterable_dataset_using_none_batch_size(self):
        """A dataloader built with `batch_size=None` over an iterable dataset still yields tensors once prepared."""
        unbatched_loader = DataLoader(SimpleIterableDataset(100), batch_size=None)
        prepared_loader = prepare_data_loader(unbatched_loader)
        for item in prepared_loader:
            assert isinstance(item, torch.Tensor)

    def test_iterable_dataset_with_non_tensor_samples(self):
        """Batches that mix tensors with plain Python values come out of a prepared dataloader unchanged in kind."""

        def collate_fn(features):
            # Attach a constant string next to the stacked tensor so each batch carries a non-tensor entry.
            stacked = torch.stack(features)
            return {"tensor": stacked, "non_tensor": "non_tensor_value"}

        source = SimpleIterableDataset(10)
        raw_loader = DataLoader(source, batch_size=4, collate_fn=collate_fn)
        accelerator = Accelerator()
        prepared_loader = accelerator.prepare_data_loader(raw_loader)
        for batch in prepared_loader:
            assert isinstance(batch["tensor"], torch.Tensor)
            assert batch["non_tensor"] == "non_tensor_value"

    @parameterized.expand([1, 2], name_func=parameterized_custom_name_func)
    def test_reproducibility(self, num_processes):
        """A shuffled, prepared dataloader yields the same order for the same seed and another order otherwise."""
        dataset = list(range(6))

        def shuffled_values(seed):
            # Seed, build a fresh shuffled dataloader, prepare it and drain it.
            set_seed(seed)
            loader = DataLoader(dataset, batch_size=1, shuffle=True)
            loader = prepare_data_loader(loader, num_processes=num_processes)
            return list(loader)

        vals_1 = shuffled_values(21)

        # check same order for same seed
        vals_2 = shuffled_values(21)
        assert vals_1 == vals_2

        # check different order for different seed
        vals_3 = shuffled_values(42)
        assert vals_1 != vals_3

    def test_skip_batch_sampler(self):
        """`SkipBatchSampler` drops the requested number of leading batches and yields the rest."""
        full_sampler = BatchSampler(range(16), batch_size=4, drop_last=False)
        remaining_batches = list(SkipBatchSampler(full_sampler, 2))
        assert remaining_batches == [[8, 9, 10, 11], [12, 13, 14, 15]]

    def test_dataloader_inheritance(self):
        """
        The parent classes of `DataLoaderAdapter` are constructed dynamically; check that its subclasses are still
        instances of their own class, of `DataLoader` and (for the shard and dispatcher) of `DataLoaderStateMixin`.
        """
        skip_dl = SkipDataLoader(range(16), batch_size=4, skip_batches=2)
        dl_shard = DataLoaderShard(range(16), batch_size=4)
        dl_dispatcher = DataLoaderDispatcher(range(16), batch_size=4)
        adapters = [
            (SkipDataLoader, skip_dl),
            (DataLoaderShard, dl_shard),
            (DataLoaderDispatcher, dl_dispatcher),
        ]

        # Each adapter is an instance of the class it was created from.
        # This looks redundant, but is worth checking because `__class__` is overridden dynamically.
        for own_class, adapter in adapters:
            assert isinstance(adapter, own_class)

        # Each adapter is also an instance of the base dataloader class.
        for _, adapter in adapters:
            assert isinstance(adapter, DataLoader)

        for adapter in (dl_shard, dl_dispatcher):
            assert isinstance(adapter, DataLoaderStateMixin)

        for _, adapter in adapters:
            assert isinstance(adapter.base_dataloader, DataLoader)

        # `base_dataloader` is expected on instances only, not on the class.
        with pytest.raises(AttributeError):
            _ = DataLoaderShard.base_dataloader

    def test_skip_data_loader(self):
        """`SkipDataLoader` omits the first `skip_batches` batches and yields the remaining ones."""
        skipping_loader = SkipDataLoader(list(range(16)), batch_size=4, skip_batches=2)
        remaining = [batch.tolist() for batch in skipping_loader]
        assert remaining == [[8, 9, 10, 11], [12, 13, 14, 15]]

    def test_skip_first_batches(self):
        """`skip_first_batches` returns a dataloader that starts after the first `num_batches` batches."""
        resumed_loader = skip_first_batches(DataLoader(list(range(16)), batch_size=4), num_batches=2)
        remaining = [batch.tolist() for batch in resumed_loader]
        assert remaining == [[8, 9, 10, 11], [12, 13, 14, 15]]

    def test_end_of_dataloader(self):
        """`DataLoaderShard.end_of_dataloader` is set on the last batch only, on every pass over the data."""
        dataloader = DataLoaderShard(list(range(16)), batch_size=4)
        last_step = 3  # 16 samples with batch_size=4 give batches 0..3

        # Two passes: the flag must also behave correctly on the second iteration.
        for _pass in range(2):
            for step, _batch in enumerate(dataloader):
                assert dataloader.end_of_dataloader == (step == last_step)

    def test_end_of_dataloader_dispatcher(self):
        """`DataLoaderDispatcher.end_of_dataloader` is set on the last batch only, on every pass over the data."""
        dataloader = DataLoaderDispatcher(range(16), batch_size=4)
        last_step = 3  # 16 samples with batch_size=4 give batches 0..3

        # Two passes: the flag must also behave correctly on the second iteration.
        for _pass in range(2):
            for step, _batch in enumerate(dataloader):
                assert dataloader.end_of_dataloader == (step == last_step)

    def test_set_epoch_in_batch_sampler(self):
        """Calling `set_epoch` on a prepared dataloader must reach a custom batch sampler that accepts it."""
        samples = list(range(16))
        custom_sampler = SimpleBatchSampler(
            samples, batch_size=4, drop_last=False, generator=torch.Generator(), seed=12
        )
        raw_loader = DataLoader(samples, batch_sampler=custom_sampler)

        accelerator = Accelerator()
        prepared_loader = accelerator.prepare_data_loader(raw_loader)

        # The epoch is observed on the original sampler object, before and after the call.
        assert custom_sampler.epoch == 0
        prepared_loader.set_epoch(1)
        assert custom_sampler.epoch == 1

    @require_datasets
    def test_iterable_dataset_native_sharding_when_n_shards_equals_num_processes(self):
        """With as many dataset shards as processes, the dataset must not be wrapped in `IterableDatasetShard`."""
        from datasets import Dataset

        num_shards = 2
        iterable_ds = Dataset.from_dict({"x": list(range(10))}).to_iterable_dataset(num_shards=num_shards)
        assert iterable_ds.n_shards == 2

        prepared = prepare_data_loader(
            DataLoader(iterable_ds, batch_size=4), num_processes=2, process_index=0, dispatch_batches=False
        )

        # n_shards (2) == num_processes (2): native sharding is expected instead of IterableDatasetShard
        assert not isinstance(prepared.dataset, IterableDatasetShard)

    def test_ensure_dataloader_gets_cleaned_up(self):
        """
        Check that the `Accelerator`, its prepared dataloader and that dataloader's `gradient_state` no longer exist
        once the only object holding them is deleted, even though iteration over the dataloader had already started.
        """

        # Ensure that the dataloader gets cleaned up properly
        class Dummy:
            # Owns an accelerator, a prepared dataloader and a live iterator over that dataloader.
            def __init__(self):
                dataset = list(range(16))
                dataloader = DataLoader(dataset, batch_size=4)

                self.accelerator = Accelerator()
                self.dataloader = self.accelerator.prepare_data_loader(dataloader)

                # Start iterating right away so that an in-progress iterator exists when the instance is deleted.
                self.iter = iter(self.dataloader)

            def __call__(self, *args, **kwds):
                # Each call returns the next batch from the stored iterator; arguments are ignored.
                return next(self.iter)

        instance = Dummy()
        # Consume a single batch, leaving the iterator partially consumed.
        assert instance().tolist() == [0, 1, 2, 3]

        # Create weak references to the objects that *should* be cleaned up if the instance is deleted
        accelerator_ref = weakref.ref(instance.accelerator)
        dataloader_ref = weakref.ref(instance.dataloader)
        gradient_state_ref = weakref.ref(instance.dataloader.gradient_state)

        del instance

        # A dead weak reference returns None: nothing else may have kept these objects alive.
        assert accelerator_ref() is None
        assert dataloader_ref() is None
        assert gradient_state_ref() is None


class StatefulDataLoaderTester(AccelerateTestCase):
    """
    Tests for `SkipDataLoader`, `DataLoaderShard` and `DataLoaderDispatcher` when they are created with
    `use_stateful_dataloader=True`, in which case they are expected to be `StatefulDataLoader` instances.
    """

    @require_torchdata_stateful_dataloader
    def test_skip_data_loader(self):
        """A stateful `SkipDataLoader` is a `StatefulDataLoader` and still skips the first `skip_batches` batches."""
        dataloader = SkipDataLoader(list(range(16)), batch_size=4, skip_batches=2, use_stateful_dataloader=True)
        assert isinstance(dataloader, StatefulDataLoader)
        assert [t.tolist() for t in dataloader] == [[8, 9, 10, 11], [12, 13, 14, 15]]

    @require_torchdata_stateful_dataloader
    def test_end_of_dataloader(self):
        """A stateful `DataLoaderShard` flags `end_of_dataloader` on the last batch only, on every pass."""
        dataloader = DataLoaderShard(list(range(16)), batch_size=4, use_stateful_dataloader=True)
        assert dataloader.use_stateful_dataloader
        assert isinstance(dataloader, StatefulDataLoader)
        # 16 samples with batch_size=4 give batches 0..3, so only index 3 is the last one.
        for idx, _ in enumerate(dataloader):
            assert dataloader.end_of_dataloader == (idx == 3)

        # Test it also works on the second iteration
        for idx, _ in enumerate(dataloader):
            assert dataloader.end_of_dataloader == (idx == 3)

    @require_torchdata_stateful_dataloader
    def test_end_of_dataloader_dispatcher(self):
        """A stateful `DataLoaderDispatcher` flags `end_of_dataloader` on the last batch only, on every pass."""
        dataloader = DataLoaderDispatcher(range(16), batch_size=4, use_stateful_dataloader=True)
        assert isinstance(dataloader, StatefulDataLoader)
        # 16 samples with batch_size=4 give batches 0..3, so only index 3 is the last one.
        for idx, _ in enumerate(dataloader):
            assert dataloader.end_of_dataloader == (idx == 3)

        # Test it also works on the second iteration
        for idx, _ in enumerate(dataloader):
            assert dataloader.end_of_dataloader == (idx == 3)

    def _check_state_dict_round_trip(self, dataloader_cls, num_workers):
        """
        Iterate a stateful `dataloader_cls` over 16 samples, snapshot its state right after the second batch, load
        the snapshot into a fresh dataloader and check that it yields exactly the batches the first one had left.

        Args:
            dataloader_cls: Dataloader class to instantiate (called with `use_stateful_dataloader=True`).
            num_workers: Number of worker processes passed to the dataloader.
        """
        dataset = list(range(16))
        dataloader = dataloader_cls(dataset, batch_size=4, use_stateful_dataloader=True, num_workers=num_workers)

        assert dataloader.use_stateful_dataloader
        assert isinstance(dataloader, StatefulDataLoader)
        vals = []
        sd = None
        for idx, val in enumerate(dataloader):
            vals.append(val)
            if idx == 1:
                # Snapshot mid-epoch: two of the four batches have been handed out at this point.
                sd = dataloader.state_dict()
        assert len(vals) == 4
        assert sd is not None, "The dataloader yielded fewer than two batches, so no state was captured"

        dataloader2 = dataloader_cls(dataset, batch_size=4, use_stateful_dataloader=True, num_workers=num_workers)
        dataloader2.load_state_dict(sd)

        # The restored dataloader must resume with the third batch of the original run.
        data1 = vals[2:]
        data2 = list(dataloader2)
        assert len(data1) == len(data2)
        for d1, d2 in zip(data1, data2):
            assert torch.allclose(d1, d2)

    @parameterized.expand([0, 2], name_func=parameterized_custom_name_func)
    @require_torchdata_stateful_dataloader
    def test_dataloader_state_dict(self, num_workers):
        """
        Test that saving a stateful dataloader's state, then loading it back, gives the same results.
        """
        self._check_state_dict_round_trip(DataLoaderShard, num_workers)

    @parameterized.expand([0, 2], name_func=parameterized_custom_name_func)
    @require_torchdata_stateful_dataloader
    def test_dataloader_dispatcher_state_dict(self, num_workers):
        """
        Test that saving a stateful dataloader's state, then loading it back, gives the same results.
        """
        self._check_state_dict_round_trip(DataLoaderDispatcher, num_workers)

    @require_torchdata_stateful_dataloader
    def test_dataloader_inheritance(self):
        """
        `DataLoaderAdapter`'s parent classes are dynamically constructed, assert that if use_stateful_dataloader=True,
        subclasses of DataLoaderAdapter are instances of StatefulDataLoader and DataLoaderStateMixin.
        """
        skip_dl = SkipDataLoader(range(16), batch_size=4, skip_batches=2, use_stateful_dataloader=True)
        dl_shard = DataLoaderShard(range(16), batch_size=4, use_stateful_dataloader=True)
        dl_dispatcher = DataLoaderDispatcher(range(16), batch_size=4, use_stateful_dataloader=True)

        # Test dataloaders are instances of instantiated classes
        # These asserts look redundant, but it's worth checking since we are doing magic tricks such as dynamically overriding __class__
        assert isinstance(skip_dl, SkipDataLoader)
        assert isinstance(dl_shard, DataLoaderShard)
        assert isinstance(dl_dispatcher, DataLoaderDispatcher)

        assert isinstance(skip_dl, StatefulDataLoader)
        assert isinstance(dl_shard, StatefulDataLoader)
        assert isinstance(dl_dispatcher, StatefulDataLoader)

        assert isinstance(dl_shard, DataLoaderStateMixin)
        assert isinstance(dl_dispatcher, DataLoaderStateMixin)

        assert isinstance(skip_dl.base_dataloader, StatefulDataLoader)
        assert isinstance(dl_shard.base_dataloader, StatefulDataLoader)
        assert isinstance(dl_dispatcher.base_dataloader, StatefulDataLoader)

    def _check_adapters_match_stateful_dataloader(self, num_workers, device):
        """
        Compare the stateful adapters against a plain `StatefulDataLoader` before and after a state round trip.

        The check runs in three steps:

        1. Partially iterate the reference dataloader and each adapter; the batches must match.
        2. Take `state_dict()` from all of them; the adapters' state dicts must equal the reference's.
        3. Load the reference state dict into fresh dataloaders (plus one `SkipDataLoader` that skips the same
           number of batches by hand) and drain them; they must all yield the same remaining batches.

        Args:
            num_workers: Number of worker processes passed to every dataloader.
            device: Device the yielded batches are moved to before being compared.

        Returns:
            A list with every dataloader created here, so that callers can keep them alive while inspecting
            `GradientState` afterwards.
        """
        dataset = list(range(64))
        num_batches_to_skip = 8

        # Set the seed for reproducibility
        def g():
            return torch.Generator().manual_seed(42)

        def make_reference():
            return StatefulDataLoader(dataset, batch_size=4, num_workers=num_workers, generator=g())

        def make_adapter(dataloader_cls, **extra_kwargs):
            return dataloader_cls(
                dataset,
                batch_size=4,
                num_workers=num_workers,
                generator=g(),
                use_stateful_dataloader=True,
                **extra_kwargs,
            )

        def get_first_n_batches(dl, n):
            """
            Pull `n` batches from `dl` and return the first `n - 1` of them, moved to `device`. The `n`-th batch is
            fetched but discarded, so the dataloader has handed out `n` batches when this returns.
            """
            batches = []
            for idx, batch in enumerate(dl):
                if idx == n - 1:
                    # The loop is abandoned early; the adapters expose `end()`, which is called here before
                    # breaking (presumably to unregister them from the gradient state -- see the callers' checks).
                    if hasattr(dl, "end"):
                        dl.end()
                    break
                batches.append(batch.to(device))
            return batches

        def get_all_batches(dl):
            """
            Iterate over all batches of a dataloader, returning (batches, num_batches_yielded)
            """
            batches = []
            num_batches_yielded = 0
            for batch in dl:
                batches.append(batch.to(device))
                num_batches_yielded += 1
            return (batches, num_batches_yielded)

        stateful_dl = make_reference()
        skip_dl = make_adapter(SkipDataLoader)
        dl_shard = make_adapter(DataLoaderShard)
        dl_dispatcher = make_adapter(DataLoaderDispatcher)
        dataloaders_under_test = [skip_dl, dl_shard, dl_dispatcher]

        # Step 1: iterate over all of the dataloaders identically, expect the same values
        expected_batches = get_first_n_batches(stateful_dl, num_batches_to_skip)
        batches_from_dataloaders = [get_first_n_batches(dl, num_batches_to_skip) for dl in dataloaders_under_test]

        for dl_batches in batches_from_dataloaders:
            # `zip` stops at the shorter input, so a length mismatch has to be caught explicitly.
            assert len(dl_batches) == len(expected_batches)
            for expected, actual in zip(expected_batches, dl_batches):
                assert torch.allclose(expected, actual)

        # Step 2: the adapters should all produce the same state_dict as the reference stateful dataloader
        expected_state_dict = stateful_dl.state_dict()
        skip_dl_state_dict = skip_dl.state_dict()
        dl_shard_state_dict = dl_shard.state_dict()
        dl_dispatcher_state_dict = dl_dispatcher.state_dict()

        assert expected_state_dict == skip_dl_state_dict
        assert expected_state_dict == dl_shard_state_dict
        assert expected_state_dict == dl_dispatcher_state_dict

        # Step 3: load the state dict into new dataloaders. `manual_skip_dl` does not load any state; it skips
        # the already consumed batches by hand and is expected to end up at the same position.
        manual_skip_dl = make_adapter(SkipDataLoader, skip_batches=num_batches_to_skip)
        loaded_stateful_dl = make_reference()
        loaded_stateful_dl.load_state_dict(expected_state_dict)
        loaded_skip_dl = make_adapter(SkipDataLoader)
        loaded_skip_dl.load_state_dict(expected_state_dict)
        loaded_dl_shard = make_adapter(DataLoaderShard)
        loaded_dl_shard.load_state_dict(expected_state_dict)
        loaded_dl_dispatcher = make_adapter(DataLoaderDispatcher)
        loaded_dl_dispatcher.load_state_dict(expected_state_dict)
        resumed_dataloaders = [manual_skip_dl, loaded_skip_dl, loaded_dl_shard, loaded_dl_dispatcher]

        # Continue the iteration, expecting identical behavior across the board
        expected_remaining, expected_num_batches = get_all_batches(loaded_stateful_dl)
        dataloader_batch_results = [get_all_batches(dl) for dl in resumed_dataloaders]

        # Compare the batches produced *after* resuming (not the ones from step 1) with the reference.
        for actual_remaining, actual_num_batches in dataloader_batch_results:
            assert expected_num_batches == actual_num_batches
            assert len(expected_remaining) == len(actual_remaining)
            for expected, actual in zip(expected_remaining, actual_remaining):
                assert torch.allclose(expected, actual)

        return [stateful_dl, *dataloaders_under_test, loaded_stateful_dl, *resumed_dataloaders]

    @parameterized.expand([0, 2], name_func=parameterized_custom_name_func)
    @require_torchdata_stateful_dataloader
    def test_stateful_dataloader_adapter_equivalent_to_torchdata_stateful_dataloader(self, num_workers):
        """
        Assert that `state_dict()` and `load_state_dict()` for derived subclasses of `DataLoaderAdapter` produce
        the same behavior as `state_dict()` and `load_state_dict()` for `StatefulDataLoader`.
        """
        accelerator = Accelerator()
        # Hold on to the dataloaders so they are still alive when the gradient state is inspected.
        _dataloaders = self._check_adapters_match_stateful_dataloader(num_workers, accelerator.device)

        assert accelerator.gradient_state.active_dataloader is None

    @parameterized.expand([0, 2], name_func=parameterized_custom_name_func)
    @require_torchdata_stateful_dataloader
    def test_decoupled_stateful_dataloader_adapter_equivalent_to_torchdata_stateful_dataloader(self, num_workers):
        """
        Assert that `state_dict()` and `load_state_dict()` for derived subclasses of `DataLoaderAdapter` produce
        the same behavior as `state_dict()` and `load_state_dict()` for `StatefulDataLoader` when *not* using
        Accelerator (and instead using the decoupled `PartialState` workflow).
        """
        state = PartialState()
        # Hold on to the dataloaders so they are still alive when the gradient state is inspected.
        _dataloaders = self._check_adapters_match_stateful_dataloader(num_workers, state.device)

        # Using the decoupled (`PartialState`) workflow, GradientState should be automatically initialized (with
        # default parameters) by `DataLoaderDispatcher`
        assert GradientState._shared_state != {}, "GradientState should already be initialized!"

        gradient_state = GradientState()
        assert gradient_state.active_dataloader is None


================================================
FILE: tests/test_dataclasses.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from unittest.mock import Mock, patch

import pytest

from accelerate.parallelism_config import ParallelismConfig
from accelerate.utils import patch_environment
from accelerate.utils.constants import (
    BETA_CP_AVAILABLE_PYTORCH_VERSION,
    BETA_SP_AVAILABLE_DEEPSPEED_VERSION,
    BETA_TP_AVAILABLE_PYTORCH_VERSION,
    BETA_TP_AVAILABLE_TRANSFORMERS_VERSION,
)
from accelerate.utils.imports import is_deepspeed_available, is_transformers_available
from accelerate.utils.versions import compare_versions, is_torch_version


def _should_skip_cp_test(cp_size):
    """Check if CP test should be skipped based on cp_size and torch version."""
    return cp_size > 1 and not is_torch_version(">=", BETA_CP_AVAILABLE_PYTORCH_VERSION)


def _should_skip_sp_test(sp_size):
    """Check if SP test should be skipped based on sp_size and deepspeed version."""
    if sp_size <= 1:
        return False
    if not is_deepspeed_available():
        return True
    return not compare_versions("deepspeed", ">=", BETA_SP_AVAILABLE_DEEPSPEED_VERSION)


def _should_skip_tp_test(tp_size):
    """Check if TP test should be skipped based on tp_size, torch version, and transformers availability."""
    if tp_size <= 1:
        return False

    if not is_torch_version(">=", BETA_TP_AVAILABLE_PYTORCH_VERSION):
        return True

    if not is_transformers_available():
        return True

    if not compare_versions("transformers", ">=", BETA_TP_AVAILABLE_TRANSFORMERS_VERSION):
        return True

    return False


class TestParallelismConfig:
    """Tests for `ParallelismConfig`: mesh layout, device-mesh construction, env parsing and CP/SP handlers.

    `torch.distributed.device_mesh.init_device_mesh` is patched for every test (see
    `mock_init_device_mesh`), so the object returned by `build_device_mesh` is a `Mock`.
    """

    # (dp_replicate_size, dp_shard_size, tp_size, cp_size, expected_shape, expected_dim_names)
    _MESH_CASES = [
        (8, 1, 1, 1, (8,), ("dp_replicate",)),  # DDP
        (1, 8, 1, 1, (8,), ("dp_shard",)),  # FSDP
        (2, 4, 1, 1, (2, 4), ("dp_replicate", "dp_shard")),  # HSDP
        (1, 4, 2, 1, (4, 2), ("dp_shard", "tp")),  # FSDP + TP
        (2, 2, 2, 1, (2, 2, 2), ("dp_replicate", "dp_shard", "tp")),  # HSDP + TP
        (1, 1, 8, 1, (8,), ("tp",)),  # TP only
        (1, 1, 1, 4, (4,), ("cp",)),  # CP only
        (1, 4, 1, 2, (4, 2), ("dp_shard", "cp")),  # FSDP + CP
        (1, 2, 2, 2, (2, 2, 2), ("dp_shard", "cp", "tp")),  # FSDP + CP + TP
        (2, 2, 2, 2, (2, 2, 2, 2), ("dp_replicate", "dp_shard", "cp", "tp")),  # HSDP + CP + TP
    ]
    # The same size combinations without the expected mesh layout.
    _SIZE_CASES = [case[:4] for case in _MESH_CASES]

    @staticmethod
    def _skip_if_unsupported(tp_size, cp_size):
        """Skip the current test when the installed versions cannot run the requested CP/TP sizes."""
        if _should_skip_cp_test(cp_size):
            pytest.skip(f"tests with `cp_size>1` require torch >= {BETA_CP_AVAILABLE_PYTORCH_VERSION}")
        if _should_skip_tp_test(tp_size):
            pytest.skip(
                f"tests with `tp_size>1` require torch >= {BETA_TP_AVAILABLE_PYTORCH_VERSION}, transformers available and >= {BETA_TP_AVAILABLE_TRANSFORMERS_VERSION}"
            )

    @pytest.fixture(autouse=True)
    def mock_init_device_mesh(self):
        """Patch `init_device_mesh` with a factory returning a `Mock` mesh that records flatten calls."""

        def mock_init_mesh(device_type, mesh_shape, mesh_dim_names):
            mesh = Mock()
            # `mesh.size()` reports the product of all mesh dimensions.
            mesh.size.return_value = 1
            for dim in mesh_shape:
                mesh.size.return_value *= dim
            mesh.shape = mesh_shape
            mesh.mesh_dim_names = mesh_dim_names

            # mock device_mesh._flatten: every `mesh[key]._flatten(name)` call is recorded as `(key, name)`
            mesh.flattened_dims = []

            def mock_getitem(key):
                submesh = Mock()

                def mock_flatten(name):
                    mesh.flattened_dims.append((key, name))

                submesh._flatten = Mock(side_effect=mock_flatten)
                return submesh

            mesh.__getitem__ = Mock(side_effect=mock_getitem)

            return mesh

        with patch("torch.distributed.device_mesh.init_device_mesh", side_effect=mock_init_mesh):
            yield mock_init_mesh

    @pytest.mark.parametrize(
        "dp_replicate_size, dp_shard_size, tp_size, cp_size, expected_shape, expected_dim_names",
        _MESH_CASES,
    )
    def test_get_mesh(
        self,
        dp_replicate_size,
        dp_shard_size,
        tp_size,
        cp_size,
        expected_shape,
        expected_dim_names,
    ):
        """Test `_get_mesh` returns the expected dimension names and shape."""
        # Skip tests based on version requirements
        self._skip_if_unsupported(tp_size=tp_size, cp_size=cp_size)

        config = ParallelismConfig(
            dp_replicate_size=dp_replicate_size, dp_shard_size=dp_shard_size, tp_size=tp_size, cp_size=cp_size
        )
        mesh_dim_names, mesh_shape = config._get_mesh()
        assert mesh_shape == expected_shape
        assert mesh_dim_names == expected_dim_names

    @pytest.mark.parametrize(
        "dp_replicate_size, dp_shard_size, tp_size, cp_size, expected_shape, expected_dim_names",
        _MESH_CASES,
    )
    def test_build_device_mesh(
        self,
        dp_replicate_size,
        dp_shard_size,
        tp_size,
        cp_size,
        expected_shape,
        expected_dim_names,
    ):
        """Test build_device_mesh creates correct mesh and applies flattening."""
        # Skip tests based on version requirements
        self._skip_if_unsupported(tp_size=tp_size, cp_size=cp_size)

        config = ParallelismConfig(
            dp_replicate_size=dp_replicate_size, dp_shard_size=dp_shard_size, tp_size=tp_size, cp_size=cp_size
        )
        device_mesh = config.build_device_mesh("cpu")

        # Check mesh shape and dimension names match expected
        assert device_mesh.shape == expected_shape
        assert device_mesh.mesh_dim_names == expected_dim_names

        # Check that correct flattening operations were called
        expected_flattened = []
        if config.dp_dim_names:
            expected_flattened.append((config.dp_dim_names, "dp"))
        if config.dp_shard_cp_dim_names:
            expected_flattened.append((config.dp_shard_cp_dim_names, "dp_shard_cp"))
        if config.dp_cp_dim_names:
            expected_flattened.append((config.dp_cp_dim_names, "dp_cp"))

        assert device_mesh.flattened_dims == expected_flattened

    @pytest.mark.parametrize(
        "dp_replicate_size, dp_shard_size, tp_size, cp_size",
        _SIZE_CASES,
    )
    def test_from_env(
        self,
        dp_replicate_size,
        dp_shard_size,
        tp_size,
        cp_size,
    ):
        """Test that sizes set through `PARALLELISM_CONFIG_*` environment variables land on the config."""
        self._skip_if_unsupported(tp_size=tp_size, cp_size=cp_size)

        new_env = {
            "PARALLELISM_CONFIG_DP_REPLICATE_SIZE": dp_replicate_size,
            "PARALLELISM_CONFIG_DP_SHARD_SIZE": dp_shard_size,
            "PARALLELISM_CONFIG_TP_SIZE": tp_size,
            "PARALLELISM_CONFIG_CP_SIZE": cp_size,
        }

        with patch_environment(**new_env):
            config = ParallelismConfig()
            for key, value in new_env.items():
                # e.g. "PARALLELISM_CONFIG_TP_SIZE" -> attribute "tp_size"
                assert getattr(config, key.split("PARALLELISM_CONFIG_")[-1].lower()) == value

    def test_cp_torch_handler(self):
        """Test CP Torch/FSDP2 handler with various configurations."""

        # Any cp_size > 1 requires torch >= BETA_CP_AVAILABLE_PYTORCH_VERSION, we use placeholder for this check as this test doesn't depend on a specific size
        if _should_skip_cp_test(2):
            pytest.skip(f"tests with `cp_size>1` require torch >= {BETA_CP_AVAILABLE_PYTORCH_VERSION}")

        from accelerate.utils import TorchContextParallelConfig

        # Strategy passed through an explicit handler.
        for setting in ("allgather", "alltoall"):
            cp_handler = TorchContextParallelConfig(cp_comm_strategy=setting)
            pc = ParallelismConfig(cp_size=2, cp_handler=cp_handler)

            assert pc.cp_handler is not None, "CP handler should be set"
            assert pc.cp_handler.cp_comm_strategy == setting, (
                f"CP handler strategy should be {setting} but got {pc.cp_handler.cp_comm_strategy}"
            )

        # Strategy picked up from the environment.
        for setting in ("allgather", "alltoall"):
            with patch_environment(PARALLELISM_CONFIG_CP_COMM_STRATEGY=setting):
                pc = ParallelismConfig(cp_size=2)
                assert pc.cp_handler is not None, "CP handler should be set from environment"
                assert pc.cp_handler.cp_comm_strategy == setting, (
                    f"CP handler strategy should be {setting} but got {pc.cp_handler.cp_comm_strategy}"
                )

        # Unknown strategies are rejected on both paths.
        for setting in ("invalid", "unsupported"):
            with pytest.raises(ValueError, match=f"Invalid cp_comm_strategy: {setting}"):
                TorchContextParallelConfig(cp_comm_strategy=setting)

            with patch_environment(PARALLELISM_CONFIG_CP_COMM_STRATEGY=setting):
                with pytest.raises(ValueError, match=f"Invalid cp_comm_strategy: {setting}"):
                    ParallelismConfig(cp_size=2)

    def test_sp_deepspeed_handler(self):
        """Test SP DeepSpeed/ALST/UlyssesSP handler with various configurations."""

        # Any sp_size > 1 requires deepspeed >= BETA_SP_AVAILABLE_DEEPSPEED_VERSION, we use placeholder for this check as this test doesn't depend on a specific size
        if _should_skip_sp_test(2):
            pytest.skip(f"tests with `sp_size>1` require deepspeed >= {BETA_SP_AVAILABLE_DEEPSPEED_VERSION}")

        from accelerate.utils import DeepSpeedSequenceParallelConfig

        sp_handler = DeepSpeedSequenceParallelConfig()
        pc = ParallelismConfig(sp_backend="deepspeed", sp_size=2, sp_handler=sp_handler)
        assert pc.sp_handler is not None, "SP handler should be set"
        assert pc.sp_handler.sp_seq_length_is_variable is True, "by default we set to expect a variable seqlen"

        with pytest.raises(ValueError, match="Invalid sp_attn_implementation"):
            DeepSpeedSequenceParallelConfig(sp_attn_implementation="foobar")

    def test_tp_handler(self):
        """Placeholder: there is no tensor-parallel handler logic to test yet."""
        assert True, "Tensor parallelism handler doesn't hold any logic yet"


================================================
FILE: tests/test_examples.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ast
import os
import re
import shutil
import tempfile
import unittest
from pathlib import Path
from typing import Optional
from unittest import mock, skip

import torch

from accelerate.test_utils.examples import compare_against_test
from accelerate.test_utils.testing import (
    TempDirTestCase,
    get_launch_command,
    is_hpu_available,
    is_xpu_available,
    require_fp16,
    require_huggingface_suite,
    require_multi_device,
    require_pippy,
    require_schedulefree,
    require_trackers,
    run_command,
    run_first,
    slow,
)
from accelerate.utils import write_basic_config


# DataLoaders built from `test_samples/MRPC` for quick testing
# Should mock `{script_name}.get_dataloaders` via:
# @mock.patch("{script_name}.get_dataloaders", mocked_dataloaders)

# File names in `examples/by_feature` that `ExampleDifferenceTests.one_complete_example` skips
# when diffing the feature scripts against a complete example.
EXCLUDE_EXAMPLES = [
    "cross_validation.py",
    "checkpointing.py",
    "gradient_accumulation.py",
    "local_sgd.py",
    "multi_process_metrics.py",
    "memory.py",
    "schedule_free.py",
    "tracking.py",
    "automatic_gradient_accumulation.py",
    "gradient_accumulation_for_autoregressive_models.py",
    "fsdp_with_peak_mem_tracking.py",
    "deepspeed_with_config_support.py",
    "megatron_lm_gpt_pretraining.py",
    "early_stopping.py",
    "ddp_comm_hook.py",
    "profiler.py",
]


class ExampleDifferenceTests(unittest.TestCase):
    """
    Verifies that each `complete_*` example contains, line for line, everything found in the
    `by_feature` scripts. A failure means a complete example is missing part of a feature script
    and should be updated.

    Every example script gets a single test (such as `test_nlp_example`) that calls
    `one_complete_example` twice: once with `parser_only=True` and once with `parser_only=False`.
    That way a failure tells the user whether the discrepancy is in the `main` function or in the
    `training_loop` function.

    Expected differences between the base script and `complete_nlp_example.py` (the canonical base
    script) belong in `special_strings`: differences in logging, print statements, etc (such as calls
    to `Accelerate.log()`).
    """

    by_feature_path = Path("examples", "by_feature").resolve()
    examples_path = Path("examples").resolve()

    def one_complete_example(
        self,
        complete_file_name: str,
        parser_only: bool,
        secondary_filename: Optional[str] = None,
        special_strings: Optional[list] = None,
    ):
        """
        Compares one `complete` example against every non-excluded `by_feature` script.

        Args:
            complete_file_name (`str`):
                File name of the complete example, relative to `examples_path`
            parser_only (`bool`):
                Whether to look at the main training function, or the argument parser
            secondary_filename (`str`, *optional*):
                An additional base file used to strip script information that is irrelevant for the check,
                such as "cv_example.py" when testing "complete_cv_example.py"
            special_strings (`list`, *optional*):
                Strings removed from the diff before asserting that nothing is left. Meant for
                file-specific differences such as logging variations.
        """
        self.maxDiff = None
        section_label = "main()" if parser_only else "training_function()"
        for entry in os.listdir(self.by_feature_path):
            if entry in EXCLUDE_EXAMPLES:
                continue
            feature_script = self.by_feature_path / entry
            # Only regular Python files are compared.
            if not (feature_script.is_file() and feature_script.suffix == ".py"):
                continue
            with self.subTest(
                tested_script=complete_file_name,
                feature_script=entry,
                tested_section=section_label,
            ):
                leftover = "\n".join(
                    compare_against_test(
                        self.examples_path / complete_file_name, feature_script, parser_only, secondary_filename
                    )
                )
                for allowed in special_strings or ():
                    leftover = leftover.replace(allowed, "")
                assert leftover == ""

    def test_nlp_examples(self):
        """Checks `complete_nlp_example.py` with `parser_only=True`, then `parser_only=False`."""
        for parser_only in (True, False):
            self.one_complete_example("complete_nlp_example.py", parser_only)

    def test_cv_examples(self):
        """Checks `complete_cv_example.py` using `cv_example.py` as the secondary base file."""
        cv_path = (self.examples_path / "cv_example.py").resolve()
        pad8, pad12, pad16, pad20 = " " * 8, " " * 12, " " * 16, " " * 20
        # Diff fragments that are expected for the CV example; they are stripped in this order.
        special_strings = [
            pad16 + "{\n\n",
            pad20 + '"accuracy": eval_metric["accuracy"],\n\n',
            pad20 + '"f1": eval_metric["f1"],\n\n',
            pad20 + '"train_loss": total_loss.item() / len(train_dataloader),\n\n',
            pad20 + '"epoch": epoch,\n\n',
            pad16 + "},\n\n",
            pad16 + "step=epoch,\n",
            pad12,
            pad8 + "for step, batch in enumerate(active_dataloader):\n",
        ]
        for parser_only in (True, False):
            self.one_complete_example("complete_cv_example.py", parser_only, cv_path, special_strings)


@mock.patch.dict(os.environ, {"TESTING_MOCKED_DATALOADERS": "1"})
@require_huggingface_suite
@run_first
class FeatureExamplesTests(TempDirTestCase):
    """Launches the `examples/by_feature` and `examples/inference` scripts with a basic config."""

    # NOTE(review): presumably keeps `self.tmpdir` contents across tests so the resume tests can read
    # checkpoints written by earlier tests - confirm against `TempDirTestCase`.
    clear_on_setup = False

    @classmethod
    def setUpClass(cls):
        """Write a basic config file into a scratch directory and build the launch command from it."""
        super().setUpClass()
        cls._tmpdir = tempfile.mkdtemp()
        cls.config_file = Path(cls._tmpdir) / "default_config.yml"

        write_basic_config(save_location=cls.config_file)
        cls.launch_args = get_launch_command(config_file=cls.config_file)

    @classmethod
    def tearDownClass(cls):
        """Remove the scratch directory that holds the config file."""
        super().tearDownClass()
        shutil.rmtree(cls._tmpdir)

    def test_checkpointing_by_epoch(self):
        script_args = (
            f"examples/by_feature/checkpointing.py --checkpointing_steps epoch --output_dir {self.tmpdir}".split()
        )
        run_command(self.launch_args + script_args)
        assert (self.tmpdir / "epoch_0").exists()

    def test_checkpointing_by_steps(self):
        script_args = (
            f"examples/by_feature/checkpointing.py --checkpointing_steps 1 --output_dir {self.tmpdir}".split()
        )
        _ = run_command(self.launch_args + script_args)
        assert (self.tmpdir / "step_2").exists()

    def test_load_states_by_epoch(self):
        checkpoint = self.tmpdir / "epoch_0"
        script_args = f"examples/by_feature/checkpointing.py --resume_from_checkpoint {checkpoint}".split()
        stdout = run_command(self.launch_args + script_args, return_stdout=True)
        assert "epoch 0:" not in stdout
        assert "epoch 1:" in stdout

    def test_load_states_by_steps(self):
        checkpoint = self.tmpdir / "step_2"
        script_args = f"examples/by_feature/checkpointing.py --resume_from_checkpoint {checkpoint}".split()
        stdout = run_command(self.launch_args + script_args, return_stdout=True)

        # Count the devices of the first available backend; fall back to a single process.
        if is_hpu_available():
            num_processes = torch.hpu.device_count()
        elif torch.cuda.is_available():
            num_processes = torch.cuda.device_count()
        elif is_xpu_available():
            num_processes = torch.xpu.device_count()
        else:
            num_processes = 1

        # Only the multi-process run is expected to skip the "epoch 0:" line.
        if num_processes > 1:
            assert "epoch 0:" not in stdout
        else:
            assert "epoch 0:" in stdout
        assert "epoch 1:" in stdout

    @slow
    def test_cross_validation(self):
        script_args = "examples/by_feature/cross_validation.py --num_folds 2".split()
        with mock.patch.dict(os.environ, {"TESTING_MOCKED_DATALOADERS": "0"}):
            stdout = run_command(self.launch_args + script_args, return_stdout=True)
            dict_like = re.findall("({.+})", stdout)
            # The last printed dict mentioning "accuracy" holds the final metrics.
            with_accuracy = [candidate for candidate in dict_like if "accuracy" in candidate]
            metrics = ast.literal_eval(with_accuracy[-1])
            assert metrics["accuracy"] >= 0.75

    def test_multi_process_metrics(self):
        run_command(self.launch_args + ["examples/by_feature/multi_process_metrics.py"])

    @require_schedulefree
    def test_schedulefree(self):
        run_command(self.launch_args + ["examples/by_feature/schedule_free.py"])

    @require_trackers
    @mock.patch.dict(
        os.environ,
        {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true", "SWANLAB_MODE": "disabled"},
    )
    def test_tracking(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            script_args = f"examples/by_feature/tracking.py --with_tracking --project_dir {tmpdir}".split()
            run_command(self.launch_args + script_args)

    def test_gradient_accumulation(self):
        run_command(self.launch_args + ["examples/by_feature/gradient_accumulation.py"])

    def test_gradient_accumulation_for_autoregressive_models(self):
        script_args = [
            "examples/by_feature/gradient_accumulation_for_autoregressive_models.py",
            "--gradient_accumulation_steps",
            "2",
        ]
        run_command(self.launch_args + script_args)

    def test_local_sgd(self):
        run_command(self.launch_args + ["examples/by_feature/local_sgd.py"])

    def test_early_stopping(self):
        run_command(self.launch_args + ["examples/by_feature/early_stopping.py"])

    def test_profiler(self):
        run_command(self.launch_args + ["examples/by_feature/profiler.py"])

    @require_fp16
    @require_multi_device
    def test_ddp_comm_hook(self):
        script_args = ["examples/by_feature/ddp_comm_hook.py", "--ddp_comm_hook", "fp16"]
        run_command(self.launch_args + script_args)

    @require_fp16
    @require_multi_device
    def test_distributed_inference_examples_stable_diffusion(self):
        run_command(self.launch_args + ["examples/inference/distributed/stable_diffusion.py"])

    @require_fp16
    @require_multi_device
    def test_distributed_inference_examples_phi2(self):
        run_command(self.launch_args + ["examples/inference/distributed/phi2.py"])

    @require_pippy
    @require_multi_device
    @skip("Will soon deprecate pippy")
    def test_pippy_examples_bert(self):
        run_command(self.launch_args + ["examples/inference/pippy/bert.py"])

    @require_pippy
    @require_multi_device
    @skip("Will soon deprecate pippy")
    def test_pippy_examples_gpt2(self):
        run_command(self.launch_args + ["examples/inference/pippy/gpt2.py"])


================================================
FILE: tests/test_fp8.py
================================================
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os
import tempfile
import textwrap
import unittest
from pathlib import Path

import torch

from accelerate import Accelerator
from accelerate.state import AcceleratorState
from accelerate.test_utils import (
    get_launch_command,
    require_cuda_or_hpu,
    require_huggingface_suite,
    require_multi_device,
    require_torchao,
    require_transformer_engine,
    require_transformer_engine_mxfp8,
    run_first,
)
from accelerate.test_utils.testing import require_deepspeed, run_command
from accelerate.utils import (
    AORecipeKwargs,
    TERecipeKwargs,
    has_ao_layers,
    has_transformer_engine_layers,
)


def can_convert_te_model(from_config=False):
    """Prepare a small model with an FP8-enabled `Accelerator` and assert it has Transformer Engine layers.

    Args:
        from_config (`bool`, *optional*, defaults to `False`):
            When `True` the `Accelerator` is created without explicit kwargs; otherwise
            `mixed_precision="fp8"` and a `TERecipeKwargs` handler are passed.
    """
    accelerator_kwargs = (
        {} if from_config else {"mixed_precision": "fp8", "kwargs_handlers": [TERecipeKwargs()]}
    )
    accelerator = Accelerator(**accelerator_kwargs)
    assert accelerator.fp8_enabled, "FP8 is not enabled"

    dataloader = torch.utils.data.DataLoader(torch.randn(10, 32), batch_size=2)
    layers = (
        torch.nn.Linear(32, 32),
        torch.nn.LayerNorm(32, bias=False),
        torch.nn.Linear(32, 16),
    )
    model = torch.nn.Sequential(*layers)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

    prepared = accelerator.prepare(model, optimizer, dataloader, scheduler)
    model, optimizer, dataloader, scheduler = prepared
    assert has_transformer_engine_layers(model)


def maintain_proper_deepspeed_config(expected_version):
    """Assert that the active DeepSpeed plugin's ZeRO stage equals `expected_version`."""
    actual_stage = AcceleratorState().deepspeed_plugin.zero_stage
    assert actual_stage == expected_version, f"Expected zero stage {expected_version} but got {actual_stage}"


def can_convert_ao_model(from_config=False):
    """Prepare `bert-base-cased` with an `Accelerator` and assert the prepared model has torchao layers.

    Args:
        from_config (`bool`, *optional*, defaults to `False`):
            When `True` the `Accelerator` is created without explicit kwargs; otherwise
            `mixed_precision="fp8"` and an `AORecipeKwargs` handler are passed.
    """
    from transformers import AutoModelForSequenceClassification

    accelerator_kwargs = (
        {} if from_config else {"mixed_precision": "fp8", "kwargs_handlers": [AORecipeKwargs()]}
    )
    accelerator = Accelerator(**accelerator_kwargs)

    dataloader = torch.utils.data.DataLoader(torch.randn(10, 32), batch_size=2)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

    prepared = accelerator.prepare(model, optimizer, dataloader, scheduler)
    model, optimizer, dataloader, scheduler = prepared
    assert has_ao_layers(model)


@run_first
@require_transformer_engine
@require_cuda_or_hpu
class TestTransformerEngine(unittest.TestCase):
    """Launches `tests.test_fp8 --test_te` in subprocesses under several launch configurations."""

    def _launch_from_config(self, config_yaml):
        """Write the dedented `config_yaml` to a temporary file and launch the TE check with `--from_config`."""
        with tempfile.TemporaryDirectory() as dir_name:
            config_file = Path(dir_name) / "config.yaml"
            config_file.write_text(textwrap.dedent(config_yaml))
            command = get_launch_command(config_file=str(config_file), monitor_interval=0.1)
            command += ["-m", "tests.test_fp8", "--test_te", "--from_config"]
            run_command(command)

    def test_can_prepare_model_single_gpu(self):
        command = get_launch_command(num_processes=1, monitor_interval=0.1)
        command += ["-m", "tests.test_fp8", "--test_te"]
        run_command(command)

    def test_can_prepare_model_single_gpu_from_config(self):
        self._launch_from_config(
            """
            distributed_type: "NO"
            num_processes: 1
            mixed_precision: fp8
            fp8_config:
              backend: TE
            """
        )

    @require_transformer_engine_mxfp8
    def test_can_prepare_model_with_mxfp8_block_scaling(self):
        self._launch_from_config(
            """
            distributed_type: "NO"
            num_processes: 1
            mixed_precision: fp8
            fp8_config:
              backend: TE
              use_mxfp8_block_scaling: true
            """
        )

    @require_multi_device
    def test_can_prepare_model_multi_gpu(self):
        command = get_launch_command(num_processes=2, monitor_interval=0.1)
        command += ["-m", "tests.test_fp8", "--test_te"]
        run_command(command)

    @require_deepspeed
    @require_multi_device
    def test_can_prepare_model_multigpu_deepspeed(self):
        from unittest import mock

        for zero_stage in [1, 2, 3]:
            ds_config = {
                "bf16": {"enabled": True},
                "zero_optimization": {
                    "stage": zero_stage,
                    "allgather_partitions": True,
                    "allgather_bucket_size": 2e8,
                    "overlap_comm": True,
                    "reduce_scatter": True,
                    "reduce_bucket_size": 2e8,
                    "contiguous_gradients": True,
                },
                "gradient_accumulation_steps": 1,
                "gradient_clipping": "auto",
                "steps_per_print": 2000,
                "train_batch_size": "auto",
                "train_micro_batch_size_per_gpu": "auto",
                "wall_clock_breakdown": False,
            }

            ds_config = json.dumps(ds_config)

            # The launched script reads ZERO_STAGE to check the stage it ended up with; scope the
            # variable to this launch so it does not leak into later tests.
            with mock.patch.dict(os.environ, {"ZERO_STAGE": str(zero_stage)}):
                command = get_launch_command(
                    num_processes=2, monitor_interval=0.1, use_deepspeed=True, deepspeed_config_file=ds_config
                )
                command += ["-m", "tests.test_fp8", "--test_te"]
                run_command(command)

    @require_deepspeed
    @require_multi_device
    def test_can_prepare_model_multigpu_deepspeed_from_config(self):
        from unittest import mock

        # ZERO_STAGE is restored on exit instead of being left set for the rest of the session.
        with mock.patch.dict(os.environ, {"ZERO_STAGE": str(1)}):
            self._launch_from_config(
                """
                distributed_type: "DEEPSPEED"
                deepspeed_config:
                  gradient_clipping: 1.0
                  gradient_accumulation_steps: 1
                  offload_optimizer_device: none
                  offload_param_device: none
                  zero3_init_flag: false
                  zero_stage: 1
                  deepspeed_multinode_launcher: standard
                num_processes: 2
                mixed_precision: fp8
                fp8_config:
                  backend: TE
                """
            )


@require_torchao
@require_huggingface_suite
class TestTorchAO(unittest.TestCase):
    """Launches `tests.test_fp8 --test_ao` in subprocesses under several launch configurations."""

    def _launch_from_config(self, config_yaml):
        """Write the dedented `config_yaml` to a temporary file and launch the AO check with `--from_config`."""
        with tempfile.TemporaryDirectory() as dir_name:
            config_file = Path(dir_name) / "config.yaml"
            config_file.write_text(textwrap.dedent(config_yaml))
            command = get_launch_command(config_file=str(config_file), monitor_interval=0.1)
            command += ["-m", "tests.test_fp8", "--test_ao", "--from_config"]
            run_command(command)

    def test_can_prepare_model_single_accelerator(self):
        command = get_launch_command(num_processes=1, monitor_interval=0.1)
        command += ["-m", "tests.test_fp8", "--test_ao"]
        run_command(command)

    def test_can_prepare_model_single_gpu_from_config(self):
        self._launch_from_config(
            """
            distributed_type: "NO"
            num_processes: 1
            mixed_precision: fp8
            fp8_config:
              backend: AO
            """
        )

    def test_can_prepare_model_single_gpu_from_config_with_additional_params(self):
        self._launch_from_config(
            """
            distributed_type: "NO"
            num_processes: 1
            mixed_precision: fp8
            fp8_config:
              backend: AO
              pad_inner_dim: true
              enable_fsdp_float8_all_gather: false
            """
        )

    @require_multi_device
    def test_can_prepare_model_multi_accelerator(self):
        command = get_launch_command(num_processes=2, monitor_interval=0.1)
        command += ["-m", "tests.test_fp8", "--test_ao"]
        run_command(command)

    @require_deepspeed
    @require_multi_device
    def test_can_prepare_model_multi_accelerator_deepspeed(self):
        from unittest import mock

        for zero_stage in [1, 2, 3]:
            ds_config = {
                "bf16": {"enabled": True},
                "zero_optimization": {
                    "stage": zero_stage,
                    "allgather_partitions": True,
                    "allgather_bucket_size": 2e8,
                    "overlap_comm": True,
                    "reduce_scatter": True,
                    "reduce_bucket_size": 2e8,
                    "contiguous_gradients": True,
                },
                "gradient_accumulation_steps": 1,
                "gradient_clipping": "auto",
                "steps_per_print": 2000,
                "train_batch_size": "auto",
                "train_micro_batch_size_per_gpu": "auto",
                "wall_clock_breakdown": False,
            }

            ds_config = json.dumps(ds_config)

            # Scope ZERO_STAGE to this launch so it does not leak into later tests.
            with mock.patch.dict(os.environ, {"ZERO_STAGE": str(zero_stage)}):
                command = get_launch_command(
                    num_processes=2, monitor_interval=0.1, use_deepspeed=True, deepspeed_config_file=ds_config
                )
                command += ["-m", "tests.test_fp8", "--test_ao"]
                run_command(command)


if __name__ == "__main__":
    # Entry point used by the test classes above, which launch this module with `-m tests.test_fp8`.
    parser = argparse.ArgumentParser()
    parser.add_argument("--test_te", action="store_true", default=False)
    parser.add_argument("--test_ao", action="store_true", default=False)
    parser.add_argument("--from_config", action="store_true", default=False)
    args = parser.parse_args()

    if not args.test_te and not args.test_ao:
        raise ValueError("Must specify at least one of --test_te or --test_ao")

    # TE suite
    if args.test_te:
        can_convert_te_model(args.from_config)
        if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true":
            # The launching test exports the expected stage through ZERO_STAGE; fail with a clear
            # message instead of `int(None)` when it is missing.
            zero_stage = os.environ.get("ZERO_STAGE")
            if zero_stage is None:
                raise ValueError("The `ZERO_STAGE` environment variable must be set when running with DeepSpeed")
            maintain_proper_deepspeed_config(int(zero_stage))

    # AO suite
    if args.test_ao:
        can_convert_ao_model(args.from_config)


================================================
FILE: tests/test_grad_sync.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from accelerate import debug_launcher
from accelerate.test_utils import (
    DEFAULT_LAUNCH_COMMAND,
    device_count,
    execute_subprocess_async,
    path_in_accelerate_package,
    require_cpu,
    require_multi_device,
    require_non_cpu,
    run_first,
    test_sync,
)
from accelerate.test_utils.testing import AccelerateTestCase
from accelerate.utils import patch_environment


class SyncScheduler(AccelerateTestCase):
    """Runs the `test_sync` script in-process, through `debug_launcher`, and through the launch command."""

    # Script appended to the launch command in `test_gradient_sync_gpu_multi`.
    test_file_path = path_in_accelerate_package("test_utils", "scripts", "test_sync.py")

    @require_cpu
    def test_gradient_sync_cpu_noop(self):
        """Run `test_sync.main` via `debug_launcher` with a single process."""
        debug_launcher(test_sync.main, num_processes=1)

    @require_cpu
    def test_gradient_sync_cpu_multi(self):
        """Run `test_sync.main` via `debug_launcher` with its default number of processes."""
        debug_launcher(test_sync.main)

    @require_non_cpu
    def test_gradient_sync_gpu(self):
        """Call `test_sync.main` directly in the current process."""
        test_sync.main()

    @run_first
    @require_multi_device
    def test_gradient_sync_gpu_multi(self):
        """Launch the sync script in a subprocess using the default launch command."""
        print(f"Found {device_count} devices.")
        cmd = DEFAULT_LAUNCH_COMMAND + [self.test_file_path]
        # The subprocess runs with `omp_num_threads=1` patched into the environment.
        with patch_environment(omp_num_threads=1):
            execute_subprocess_async(cmd)


================================================
FILE: tests/test_hooks.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import re
import unittest

import torch
import torch.nn as nn
from parameterized import parameterized
from torch.fx import symbolic_trace

from accelerate.big_modeling import attach_layerwise_casting_hooks
from accelerate.hooks import (
    AlignDevicesHook,
    CpuOffload,
    ModelHook,
    SequentialHook,
    UserCpuOffloadHook,
    add_hook_to_module,
    attach_align_device_hook,
    remove_hook_from_module,
    remove_hook_from_submodules,
)
from accelerate.test_utils import require_multi_device, require_non_hpu, torch_device
from accelerate.utils import is_xpu_available
from accelerate.utils.constants import SUPPORTED_PYTORCH_LAYERS_FOR_UPCASTING


# Use an explicitly indexed device string ("<backend>:0") for non-CPU backends: the tests below compare
# against `torch.device(torch_device)` and derive the second device with `.replace(":0", ":1")`.
torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"


class ModelForTest(nn.Module):
    """Tiny three-layer network (Linear 3->4, BatchNorm1d, Linear 4->5) used as a hook target in the tests."""

    def __init__(self):
        super().__init__()
        # Layers are created in execution order; the attribute names are relied upon by the tests.
        self.linear1 = nn.Linear(3, 4)
        self.batchnorm = nn.BatchNorm1d(4)
        self.linear2 = nn.Linear(4, 5)

    def forward(self, x):
        """Apply `linear1`, `batchnorm` and `linear2` in sequence to `x`."""
        hidden = self.linear1(x)
        normalized = self.batchnorm(hidden)
        return self.linear2(normalized)


class PreForwardHook(ModelHook):
    """Hook that adds 1 to the first positional argument before the wrapped forward runs."""

    def pre_forward(self, module, *args, **kwargs):
        """Return `(args, kwargs)` with `args[0]` incremented by one and everything else untouched."""
        shifted_first = args[0] + 1
        return (shifted_first, *args[1:]), kwargs


class PostForwardHook(ModelHook):
    """Hook that adds 1 to the output of the wrapped forward."""

    def post_forward(self, module, output):
        """Return `output + 1`; `module` is unused."""
        return output + 1


class HooksModelTester(unittest.TestCase):
    def check_dtype_for_layerwise_upcasting(
        self,
        module,
        storage_dtype,
        loading_type,
        patterns_to_check=None,
    ):
        """
        Assert the dtype of every `weight`/`bias` found on the submodules of `module`.

        Layers that are instances of `SUPPORTED_PYTORCH_LAYERS_FOR_UPCASTING` must be in `storage_dtype`, unless
        their name matches one of the regexes in `patterns_to_check`, in which case they must be in
        `loading_type`. Other layers must be in `loading_type`, and are only checked when `patterns_to_check` is
        `None`.
        """
        for name, submodule in module.named_modules():
            # Only the attributes that exist and are not None are checked.
            tensors = [
                getattr(submodule, attr_name)
                for attr_name in ("weight", "bias")
                if getattr(submodule, attr_name, None) is not None
            ]

            if isinstance(submodule, SUPPORTED_PYTORCH_LAYERS_FOR_UPCASTING):
                if patterns_to_check and any(re.search(pat, name) for pat in patterns_to_check):
                    expected = loading_type
                else:
                    expected = storage_dtype
            elif patterns_to_check is None:
                expected = loading_type
            else:
                # Unsupported layer while skip patterns are in use: nothing is asserted.
                continue

            for tensor in tensors:
                self.assertEqual(tensor.dtype, expected)

    def test_add_and_remove_hooks(self):
        """Adding a hook sets `_hf_hook`/`_old_forward` without altering `forward`'s name or signature; removing it deletes both."""
        test_model = ModelForTest()
        test_hook = ModelHook()

        add_hook_to_module(test_model, test_hook)
        assert test_model._hf_hook == test_hook
        assert hasattr(test_model, "_old_forward")

        # Check adding the hook did not change the name or the signature
        assert test_model.forward.__name__ == "forward"
        assert list(inspect.signature(test_model.forward).parameters) == ["x"]

        # Removal must leave no trace of the hook on the module.
        remove_hook_from_module(test_model)
        assert not hasattr(test_model, "_hf_hook")
        assert not hasattr(test_model, "_old_forward")

    def test_append_and_remove_hooks(self):
        """Adding a second hook with `append=True` yields a `SequentialHook` holding two hooks; removal cleans everything up."""
        test_model = ModelForTest()
        test_hook = ModelHook()

        add_hook_to_module(test_model, test_hook)
        # `append=True` chains onto the existing hook instead of replacing it.
        add_hook_to_module(test_model, test_hook, append=True)

        assert isinstance(test_model._hf_hook, SequentialHook) is True
        assert len(test_model._hf_hook.hooks) == 2
        assert hasattr(test_model, "_old_forward")

        # Check adding the hook did not change the name or the signature
        assert test_model.forward.__name__ == "forward"
        assert list(inspect.signature(test_model.forward).parameters) == ["x"]

        remove_hook_from_module(test_model)
        assert not hasattr(test_model, "_hf_hook")
        assert not hasattr(test_model, "_old_forward")

    def test_pre_forward_hook_is_executed(self):
        """A `PreForwardHook` shifts the input by 1; re-adding replaces it, and a `SequentialHook` of two shifts it by 2."""
        test_model = ModelForTest()
        x = torch.randn(2, 3)
        # Reference outputs computed before any hook is attached.
        expected = test_model(x + 1)
        expected2 = test_model(x + 2)

        test_hook = PreForwardHook()
        add_hook_to_module(test_model, test_hook)
        output1 = test_model(x)
        assert torch.allclose(output1, expected, atol=1e-5)

        # Attaching a hook to a model when it already has one replaces, does not chain
        test_hook = PreForwardHook()
        add_hook_to_module(test_model, test_hook)
        output1 = test_model(x)
        assert torch.allclose(output1, expected, atol=1e-5)

        # You need to use the sequential hook to chain two or more hooks
        test_hook = SequentialHook(PreForwardHook(), PreForwardHook())
        add_hook_to_module(test_model, test_hook)

        output2 = test_model(x)
        assert torch.allclose(output2, expected2, atol=1e-5)

    def test_post_forward_hook_is_executed(self):
        """A `PostForwardHook` adds 1 to the output; re-adding replaces it, and a `SequentialHook` of two adds 2."""
        test_model = ModelForTest()
        x = torch.randn(2, 3)
        # Reference output computed before any hook is attached.
        output = test_model(x)

        test_hook = PostForwardHook()
        add_hook_to_module(test_model, test_hook)
        output1 = test_model(x)
        assert torch.allclose(output1, (output + 1), atol=1e-5)

        # Attaching a hook to a model when it already has one replaces, does not chain
        test_hook = PostForwardHook()
        add_hook_to_module(test_model, test_hook)
        output1 = test_model(x)
        assert torch.allclose(output1, (output + 1), atol=1e-5)

        # You need to use the sequential hook to chain two or more hooks
        test_hook = SequentialHook(PostForwardHook(), PostForwardHook())
        add_hook_to_module(test_model, test_hook)

        output2 = test_model(x)
        assert torch.allclose(output2, output + 2, atol=1e-5)

    def test_no_grad_in_hook(self):
        """Setting `no_grad = True` on an attached hook makes the hooked forward return a tensor without grad."""
        test_model = ModelForTest()
        x = torch.randn(2, 3)
        output = test_model(x)

        test_hook = PostForwardHook()
        add_hook_to_module(test_model, test_hook)
        output1 = test_model(x)
        assert torch.allclose(output1, (output + 1))
        # Before `no_grad` is set on the hook, the output still tracks gradients.
        assert output1.requires_grad

        # The flag is flipped on the already-attached hook instance.
        test_hook.no_grad = True
        output1 = test_model(x)
        assert not output1.requires_grad

    @require_non_hpu  # hpu does not support device indexing "hpu:1"
    @require_multi_device
    def test_align_devices_as_model_parallelism(self):
        """`AlignDevicesHook(execution_device=...)` places each submodule on its device and the forward still runs end to end."""
        model = ModelForTest()
        # Everything is on CPU
        assert model.linear1.weight.device == torch.device("cpu")
        assert model.batchnorm.weight.device == torch.device("cpu")
        assert model.linear2.weight.device == torch.device("cpu")

        # This will move each submodule on different devices
        add_hook_to_module(model.linear1, AlignDevicesHook(execution_device=0))
        add_hook_to_module(model.batchnorm, AlignDevicesHook(execution_device=0))
        add_hook_to_module(model.linear2, AlignDevicesHook(execution_device=1))

        # Device index 0 is `torch_device`; index 1 is obtained by swapping the ":0" suffix for ":1".
        assert model.linear1.weight.device == torch.device(torch_device)
        assert model.batchnorm.weight.device == torch.device(torch_device)
        assert model.batchnorm.running_mean.device == torch.device(torch_device)
        assert model.linear2.weight.device == torch.device(torch_device.replace(":0", ":1"))

        # We can still make a forward pass. The input does not need to be on any particular device
        x = torch.randn(2, 3)
        output = model(x)
        assert output.device == torch.device(torch_device.replace(":0", ":1"))

        # We can add a general hook to put back output on same device as input.
        add_hook_to_module(model, AlignDevicesHook(io_same_device=True))
        x = torch.randn(2, 3).to(torch_device)
        output = model(x)
        assert output.device == torch.device(torch_device)

    def test_align_devices_as_cpu_offload(self):
        """
        Per-submodule `AlignDevicesHook(offload=True)`: parameters sit on the meta device while hooked, the forward
        output lands on the execution device, and removing the hooks brings the weights back to CPU. Run once
        without and once with `offload_buffers=True`.
        """
        model = ModelForTest()

        # Everything is on CPU
        assert model.linear1.weight.device == torch.device("cpu")
        assert model.batchnorm.weight.device == torch.device("cpu")
        assert model.linear2.weight.device == torch.device("cpu")

        # Attach an offloading hook to each submodule, all sharing the same execution device
        hook_kwargs = {"execution_device": torch_device, "offload": True}

        add_hook_to_module(model.linear1, AlignDevicesHook(**hook_kwargs))
        add_hook_to_module(model.batchnorm, AlignDevicesHook(**hook_kwargs))
        add_hook_to_module(model.linear2, AlignDevicesHook(**hook_kwargs))

        # Parameters have been offloaded, so on the meta device
        assert model.linear1.weight.device == torch.device("meta")
        assert model.batchnorm.weight.device == torch.device("meta")
        assert model.linear2.weight.device == torch.device("meta")
        # Buffers are not included in the offload by default, so are on the execution device
        device = torch.device(hook_kwargs["execution_device"])
        assert model.batchnorm.running_mean.device == device

        # The input is created on CPU; the output is expected on the execution device.
        x = torch.randn(2, 3)
        output = model(x)
        assert output.device == device

        # Removing hooks loads back the weights in the model.
        remove_hook_from_module(model.linear1)
        remove_hook_from_module(model.batchnorm)
        remove_hook_from_module(model.linear2)
        assert model.linear1.weight.device == torch.device("cpu")
        assert model.batchnorm.weight.device == torch.device("cpu")
        assert model.linear2.weight.device == torch.device("cpu")

        # Now test with buffers included in the offload
        hook_kwargs = {
            "execution_device": torch_device,
            "offload": True,
            "offload_buffers": True,
        }

        add_hook_to_module(model.linear1, AlignDevicesHook(**hook_kwargs))
        add_hook_to_module(model.batchnorm, AlignDevicesHook(**hook_kwargs))
        add_hook_to_module(model.linear2, AlignDevicesHook(**hook_kwargs))

        # Parameters have been offloaded, so on the meta device, buffers included
        assert model.linear1.weight.device == torch.device("meta")
        assert model.batchnorm.weight.device == torch.device("meta")
        assert model.linear2.weight.device == torch.device("meta")
        assert model.batchnorm.running_mean.device == torch.device("meta")

        # `device` is reused from the first half: the execution device is the same in both configurations.
        x = torch.randn(2, 3)
        output = model(x)
        assert output.device == device

        # Removing hooks loads back the weights in the model.
        remove_hook_from_module(model.linear1)
        remove_hook_from_module(model.batchnorm)
        remove_hook_from_module(model.linear2)
        assert model.linear1.weight.device == torch.device("cpu")
        assert model.batchnorm.weight.device == torch.device("cpu")
        assert model.linear2.weight.device == torch.device("cpu")

    def test_attach_align_device_hook_as_cpu_offload(self):
        """
        Same offload scenario as the per-submodule test, but hooks are attached with a single
        `attach_align_device_hook(model, ...)` call and removed with `remove_hook_from_submodules(model)`.
        """
        model = ModelForTest()

        # Everything is on CPU
        assert model.linear1.weight.device == torch.device("cpu")
        assert model.batchnorm.weight.device == torch.device("cpu")
        assert model.linear2.weight.device == torch.device("cpu")

        # Attach offloading hooks to the whole model in one call
        execution_device = torch_device
        attach_align_device_hook(model, execution_device=execution_device, offload=True)

        # Parameters have been offloaded, so on the meta device
        assert model.linear1.weight.device == torch.device("meta")
        assert model.batchnorm.weight.device == torch.device("meta")
        assert model.linear2.weight.device == torch.device("meta")
        # Buffers are not included in the offload by default, so are on the execution device
        device = torch.device(execution_device)
        assert model.batchnorm.running_mean.device == device

        # The input is created on CPU; the output is expected on the execution device.
        x = torch.randn(2, 3)
        output = model(x)
        assert output.device == device

        # Removing hooks loads back the weights in the model.
        remove_hook_from_submodules(model)
        assert model.linear1.weight.device == torch.device("cpu")
        assert model.batchnorm.weight.device == torch.device("cpu")
        assert model.linear2.weight.device == torch.device("cpu")

        # Now test with buffers included in the offload
        attach_align_device_hook(model, execution_device=execution_device, offload=True, offload_buffers=True)

        # Parameters have been offloaded, so on the meta device, buffers included
        assert model.linear1.weight.device == torch.device("meta")
        assert model.batchnorm.weight.device == torch.device("meta")
        assert model.linear2.weight.device == torch.device("meta")
        assert model.batchnorm.running_mean.device == torch.device("meta")

        x = torch.randn(2, 3)
        output = model(x)
        assert output.device == device

        # Removing hooks loads back the weights in the model.
        remove_hook_from_submodules(model)
        assert model.linear1.weight.device == torch.device("cpu")
        assert model.batchnorm.weight.device == torch.device("cpu")
        assert model.linear2.weight.device == torch.device("cpu")

    def test_attach_align_device_hook_as_cpu_offload_with_weight_map(self):
        """
        Offload scenario driven by `attach_align_device_hook`, with the model's own `state_dict()` passed as
        `weights_map`. Run once without and once with `offload_buffers=True`.
        """
        model = ModelForTest()

        # Everything is on CPU
        assert model.linear1.weight.device == torch.device("cpu")
        assert model.batchnorm.weight.device == torch.device("cpu")
        assert model.linear2.weight.device == torch.device("cpu")

        # Attach offloading hooks to the whole model, supplying the weights through `weights_map`
        execution_device = torch_device
        attach_align_device_hook(
            model, execution_device=execution_device, offload=True, weights_map=model.state_dict()
        )

        # Parameters have been offloaded, so on the meta device
        assert model.linear1.weight.device == torch.device("meta")
        assert model.batchnorm.weight.device == torch.device("meta")
        assert model.linear2.weight.device == torch.device("meta")
        # Buffers are not included in the offload by default, so are on the execution device
        device = torch.device(execution_device)
        assert model.batchnorm.running_mean.device == device

        # The input is created on CPU; the output is expected on the execution device.
        x = torch.randn(2, 3)
        output = model(x)
        assert output.device == device

        # Removing hooks loads back the weights in the model.
        remove_hook_from_submodules(model)
        assert model.linear1.weight.device == torch.device("cpu")
        assert model.batchnorm.weight.device == torch.device("cpu")
        assert model.linear2.weight.device == torch.device("cpu")

        # Now test with buffers included in the offload
        attach_align_device_hook(
            model,
            execution_device=execution_device,
            offload=True,
            weights_map=model.state_dict(),
            offload_buffers=True,
        )

        # Parameters have been offloaded, so on the meta device, buffers included
        assert model.linear1.weight.device == torch.device("meta")
        assert model.batchnorm.weight.device == torch.device("meta")
        assert model.linear2.weight.device == torch.device("meta")
        assert model.batchnorm.running_mean.device == torch.device("meta")

        x = torch.randn(2, 3)
        output = model(x)
        assert output.device == device

        # Removing hooks loads back the weights in the model.
        remove_hook_from_submodules(model)
        assert model.linear1.weight.device == torch.device("cpu")
        assert model.batchnorm.weight.device == torch.device("cpu")
        assert model.linear2.weight.device == torch.device("cpu")

    def test_add_remove_hook_fx_graph_module(self):
        """An `fx.GraphModule` that had a hook added and removed can still be edited and recompiled."""
        with torch.no_grad():
            test_model = ModelForTest()
            test_hook = ModelHook()

            x = torch.randn(2, 3)
            output1 = test_model(x)

            graph_model = symbolic_trace(test_model)

            output2 = graph_model(x)

            # Tracing alone must not change the result.
            assert torch.allclose(output1, output2)

            add_hook_to_module(graph_model, test_hook)
            remove_hook_from_module(graph_model, recurse=True)

            # We want to make sure that `add_hook_to_module` and `remove_hook_from_module` yields back an fx.GraphModule
            # that behaves correctly (for example that is not frozen, see https://github.com/huggingface/accelerate/pull/2369).
            # For that, we add a sigmoid node to the FX graph and make sure that the new output (output3 below) is different than
            # the original model's output.
            linear2_node = None
            for node in graph_model.graph.nodes:
                if node.name == "linear2":
                    linear2_node = node
            assert linear2_node is not None

            # NOTE(review): `inserting_after` is called outside a `with` block here, so the insertion point is
            # presumably left set for the remainder of the test rather than restored — confirm against torch.fx.
            graph_model.graph.inserting_after(linear2_node)
            # NOTE: the new node is named "relu" although its target is `torch.sigmoid`.
            new_node = graph_model.graph.create_node(
                op="call_function", target=torch.sigmoid, args=(linear2_node,), name="relu"
            )

            output_node = None
            for node in graph_model.graph.nodes:
                if node.name == "output":
                    output_node = node
            assert output_node is not None

            # Route the graph output through the new node instead of `linear2` directly.
            output_node.replace_input_with(linear2_node, new_node)

            graph_model.graph.lint()
            graph_model.recompile()

            output3 = graph_model(x)

            # Now the output is expected to be different since we modified the graph.
            assert not torch.allclose(output1, output3)

    # Each case is (storage_dtype, compute_dtype[, skip_modules_pattern]).
    @parameterized.expand(
        [
            (torch.float16, torch.float32),
            (torch.float8_e4m3fn, torch.float32),
            (torch.float8_e4m3fn, torch.float32, ["batchnorm"]),
        ]
    )
    def test_layerwise_upcasting_inference(self, storage_dtype, compute_dtype, skip_modules_pattern=None):
        """Attach layerwise casting hooks, verify the resulting parameter dtypes, then run a forward pass."""
        test_model = ModelForTest()
        # Dtype the freshly built model's parameters are in, taken from its first parameter.
        loading_dtype = next(test_model.parameters()).data.dtype
        inputs = torch.randn(2, 3)
        inputs = inputs.to(compute_dtype) if inputs.dtype == torch.float32 else inputs

        attach_layerwise_casting_hooks(
            test_model,
            storage_dtype=storage_dtype,
            compute_dtype=compute_dtype,
            skip_modules_pattern=skip_modules_pattern,
        )
        # An empty pattern list is normalized to None for the dtype check.
        patterns_to_check = skip_modules_pattern if skip_modules_pattern else None
        self.check_dtype_for_layerwise_upcasting(test_model, storage_dtype, loading_dtype, patterns_to_check)

        # The forward pass only has to complete without raising; its output is discarded.
        with torch.no_grad():
            _ = test_model(inputs)

    def test_cpu_offload_hook_moves_model(self):
        """With a `CpuOffload` hook the forward output is on the execution device, and the weights stay there after the hook is removed."""
        if not (torch.cuda.is_available() or is_xpu_available()):
            self.skipTest("CUDA or XPU not available for offload test.")

        model = ModelForTest()
        target = torch.device(torch_device)
        add_hook_to_module(model, CpuOffload(execution_device=target))

        x = torch.randn(2, 3).to(target)
        hooked_output = model(x)
        self.assertEqual(hooked_output.device, target)

        # Once the hook is gone the model must still run on the device it was moved to.
        remove_hook_from_module(model)
        unhooked_output = model(x)
        self.assertEqual(unhooked_output.device, target)

        # should be on the device
        for layer in (model.linear1, model.batchnorm, model.linear2):
            assert layer.weight.device == target

    def test_cpu_offload_hook_with_prev_module(self):
        """Running a model whose `CpuOffload` hook has a `prev_module_hook` sends the previous model back to CPU."""
        if not (torch.cuda.is_available() or is_xpu_available()):
            self.skipTest("CUDA or XPU not available for offload test.")

        model1 = ModelForTest()
        model2 = ModelForTest()
        target = torch.device(torch_device)
        cpu = torch.device("cpu")

        # model1 gets a plain offload hook, wrapped in a user-facing handle ...
        first_hook = CpuOffload(execution_device=target)
        add_hook_to_module(model1, first_hook)
        first_handle = UserCpuOffloadHook(model1, first_hook)

        # ... which model2's hook receives as its previous-module hook.
        second_hook = CpuOffload(execution_device=target, prev_module_hook=first_handle)
        add_hook_to_module(model2, second_hook)

        x = torch.randn(2, 3).to(target)
        first_output = model1(x)
        self.assertEqual(first_output.device, target)

        second_output = model2(x)
        self.assertEqual(second_output.device, target)

        # should be on the cpu
        for layer in (model1.linear1, model1.batchnorm, model1.linear2):
            assert layer.weight.device == cpu

        # should be on the device still
        for layer in (model2.linear1, model2.batchnorm, model2.linear2):
            assert layer.weight.device == target


================================================
FILE: tests/test_imports.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess
import sys

from accelerate.test_utils import require_transformer_engine
from accelerate.test_utils.testing import TempDirTestCase, require_import_timer
from accelerate.utils import is_import_timer_available


if is_import_timer_available():
    from import_timer import calculate_total_time, read_import_profile
    from import_timer.core import get_paths_above_threshold, sort_nodes_by_total_time


def convert_list_to_string(data):
    """
    Render import paths and their timings as text, one entry per line.

    Args:
        data: Iterable of entries where `entry[0]` is a sequence of strings (the import chain) and `entry[1]` is
            a number (seconds).

    Returns:
        A string with one `"a->b->c 1.234s\\n"` line per entry, or `""` when `data` is empty.
    """
    arrow_right = "->"
    # Build all lines in one pass with `str.join` instead of repeated string concatenation.
    return "".join(f"{arrow_right.join(path[0])} {path[1]:.3f}s\n" for path in data)


def run_import_time(command: str):
    """Run `command` in a fresh interpreter with `-X importtime` and return what the child wrote to stderr."""
    argv = [sys.executable, "-X", "importtime", "-c", command]
    completed = subprocess.run(argv, capture_output=True, text=True)
    return completed.stderr


@require_import_timer
class ImportSpeedTester(TempDirTestCase):
    """
    Test suite which checks if imports have seen slowdowns
    based on a particular baseline.

    If the error messages are not clear enough to get a
    full view of what is slowing things down (or to
    figure out how deep the initial depth should be),
    please view the profile with the `tuna` framework:
    `tuna import.log`.
    """

    clear_on_setup = False

    @classmethod
    def setUpClass(cls):
        """Measure the baseline once: the total import time of a bare `import torch`."""
        super().setUpClass()
        output = run_import_time("import torch")
        data = read_import_profile(output)
        total_time = calculate_total_time(data)
        cls.pytorch_time = total_time

    def _assert_import_overhead(self, command, label):
        """
        Assert that running `command` costs less than 20% more import time than the torch baseline.

        Args:
            command: Python source executed in a fresh interpreter under `-X importtime`.
            label: Short name of what is being imported, used as the first word of the failure message.
        """
        output = run_import_time(command)
        data = read_import_profile(output)
        total_time = calculate_total_time(data)
        # Relative overhead, in percent, compared to the baseline measured in `setUpClass`.
        pct_more = (total_time - self.pytorch_time) / self.pytorch_time * 100
        err_msg = f"{label} import is more than 20% slower than raw torch import ({pct_more:.2f}%), please check the attached `tuna` profile:\n"
        # Append the paths returned by `get_paths_above_threshold` to the message to help locate the slowdown.
        sorted_data = sort_nodes_by_total_time(data)
        paths_above_threshold = get_paths_above_threshold(sorted_data, 0.05, max_depth=7)
        err_msg += f"\n{convert_list_to_string(paths_above_threshold)}"
        self.assertLess(pct_more, 20, err_msg)

    def test_base_import(self):
        # Base import should never be more than 20% slower than raw torch import
        self._assert_import_overhead("import accelerate", "Base")

    def test_cli_import(self):
        # CLI import should never be more than 20% slower than raw torch import
        self._assert_import_overhead("from accelerate.commands.launch import launch_command_parser", "CLI")


@require_transformer_engine
class LazyImportTester(TempDirTestCase):
    """
    Test suite which checks if specific packages are lazy-loaded.

    Eager-import will trigger circular import in some case,
    e.g. in huggingface/accelerate#3056.
    """

    def test_te_import(self):
        """Importing accelerate and its `transformer_engine` utilities must not import the `transformer_engine` package itself."""
        output = run_import_time("import accelerate, accelerate.utils.transformer_engine")

        # The leading space presumably keeps `accelerate.utils.transformer_engine` (preceded by a dot in the
        # import-time log) from matching; only a top-level `transformer_engine` entry should trigger a failure.
        self.assertFalse(" transformer_engine" in output, "`transformer_engine` should not be imported on import")


================================================
FILE: tests/test_kwargs_handlers.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import os
from dataclasses import dataclass

import torch

from accelerate import Accelerator, DistributedDataParallelKwargs, GradScalerKwargs
from accelerate.state import AcceleratorState
from accelerate.test_utils import (
    DEFAULT_LAUNCH_COMMAND,
    execute_subprocess_async,
    path_in_accelerate_package,
    require_fp16,
    require_multi_device,
    require_non_cpu,
    run_first,
)
from accelerate.test_utils.testing import AccelerateTestCase, slow
from accelerate.utils import (
    AutocastKwargs,
    KwargsHandler,
    ProfileKwargs,
    TorchDynamoPlugin,
    clear_environment,
)
from accelerate.utils.dataclasses import DistributedType


@dataclass
class MockClass(KwargsHandler):
    """Minimal `KwargsHandler` with three defaulted fields, used to exercise `to_kwargs`."""

    a: int = 0
    b: bool = False
    c: float = 3.0


class KwargsHandlerTester(AccelerateTestCase):
    def test_kwargs_handler(self):
        """`to_kwargs` only reports the fields whose value differs from the dataclass default."""
        # (constructor kwargs, expected `to_kwargs()` result); the first case changes no default.
        cases = [
            ({}, {}),
            ({"a": 2}, {"a": 2}),
            ({"a": 2, "b": True}, {"a": 2, "b": True}),
            ({"a": 2, "c": 2.25}, {"a": 2, "c": 2.25}),
        ]
        for init_kwargs, expected in cases:
            assert MockClass(**init_kwargs).to_kwargs() == expected

    @require_fp16
    @require_non_cpu
    def test_grad_scaler_kwargs(self):
        """Values given through `GradScalerKwargs` reach the accelerator's scaler; untouched ones keep their defaults."""
        # Override two scaler settings through the kwargs handler.
        scaler_handler = GradScalerKwargs(init_scale=1024, growth_factor=2)
        AcceleratorState._reset_state()
        accelerator = Accelerator(mixed_precision="fp16", kwargs_handlers=[scaler_handler])
        assert accelerator.mixed_precision == "fp16"
        scaler = accelerator.scaler

        # Check the kwargs have been applied
        assert scaler._init_scale == 1024.0
        assert scaler._growth_factor == 2.0

        # Check the other values are at the default
        assert scaler._backoff_factor == 0.5
        assert scaler._growth_interval == 2000
        assert scaler._enabled is True

    @run_first
    @require_multi_device
    def test_ddp_kwargs(self):
        """Launch this very file as a script with the default launch command; the checks live in the module-level `main()`."""
        # `inspect.getfile` resolves the path of the file defining this test class.
        cmd = DEFAULT_LAUNCH_COMMAND + [inspect.getfile(self.__class__)]
        execute_subprocess_async(cmd)

    @require_fp16
    @require_non_cpu
    def test_autocast_kwargs(self):
        """A nested `accelerator.autocast(autocast_handler=AutocastKwargs(enabled=False))` disables fp16 autocast only inside that block."""
        kwargs = AutocastKwargs(enabled=False)
        AcceleratorState._reset_state()
        accelerator = Accelerator(mixed_precision="fp16")

        a_float32 = torch.rand((8, 8), device=accelerator.device)
        b_float32 = torch.rand((8, 8), device=accelerator.device)
        c_float32 = torch.rand((8, 8), device=accelerator.device)
        d_float32 = torch.rand((8, 8), device=accelerator.device)

        with accelerator.autocast():
            # Inside the outer context, matmul of float32 inputs yields float16.
            e_float16 = torch.mm(a_float32, b_float32)
            assert e_float16.dtype == torch.float16

            with accelerator.autocast(autocast_handler=kwargs):
                # Convert e_float16 to float32
                f_float32 = torch.mm(c_float32, e_float16.float())
                assert f_float32.dtype == torch.float32

            g_float16 = torch.mm(d_float32, f_float32)
            # We should be back in fp16
            assert g_float16.dtype == torch.float16

    @slow
    def test_profile_kwargs(self):
        """For several profiler schedules, `on_trace_ready` must fire the expected number of times and produce a CPU time table."""
        # Arrange
        schedule_options = [
            dict(wait=1, warmup=1, active=2, repeat=1),
            dict(wait=2, warmup=2, active=2, repeat=2),
            dict(wait=0, warmup=1, active=3, repeat=3, skip_first=1),
            dict(wait=3, warmup=2, active=1, repeat=1, skip_first=2),
            dict(wait=1, warmup=0, active=1, repeat=5),
        ]

        total_steps = 100

        for option in schedule_options:
            # Fresh counters for each schedule; the callback below rebinds `count` through `nonlocal`.
            count = 0
            table_outputs = []
            # One cycle is wait + warmup + active steps; `skip_first` steps are excluded before cycles start.
            steps_per_cycle = option["wait"] + option["warmup"] + option["active"]
            effective_steps = max(0, total_steps - option.get("skip_first", 0))
            cycles = effective_steps // steps_per_cycle
            # A positive `repeat` caps the number of cycles; otherwise every complete cycle counts.
            if option["repeat"] > 0:
                expected_count = min(cycles, option["repeat"])
            else:
                expected_count = cycles

            def on_trace_ready(prof):
                # Count the invocation and keep the rendered table for the assertions below.
                nonlocal count
                nonlocal table_outputs

                count += 1
                table_outputs.append(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1))

            kwargs = ProfileKwargs(activities=["cpu"], on_trace_ready=on_trace_ready, schedule_option=option)
            accelerator = Accelerator(kwargs_handlers=[kwargs])

            # Act
            with accelerator.profile() as prof:
                for _ in range(total_steps):
                    prof.step()
                    torch.tensor([1, 2, 3, 4, 5], device=accelerator.device)

            # Assert
            assert isinstance(prof, torch.profiler.profile)
            assert count == expected_count, f"Option: {option}, Expected count: {expected_count}, but got {count}"
            for output in table_outputs:
                self.assertIn("CPU time total:", output)

    def test_torch_dynamo_plugin(self):
        """`TorchDynamoPlugin` reads its backend and mode from `ACCELERATE_DYNAMO_*` environment variables."""
        with clear_environment():
            prefix = "ACCELERATE_DYNAMO_"
            # Set the variables inside the `clear_environment()` context only; the final assert (outside the
            # context) fails if these changes leaked into `os.environ` permanently.
            os.environ[prefix + "BACKEND"] = "aot_ts_nvfuser"
            os.environ[prefix + "MODE"] = "reduce-overhead"

            dynamo_plugin_kwargs = TorchDynamoPlugin().to_kwargs()
            assert dynamo_plugin_kwargs == {"backend": "aot_ts_nvfuser", "mode": "reduce-overhead"}
        assert os.environ.get(prefix + "BACKEND") != "aot_ts_nvfuser"

    @run_first
    @require_multi_device
    def test_ddp_comm_hook(self):
        """Launch the packaged `test_ddp_comm_hook.py` script in a subprocess with the default launch command."""
        cmd = DEFAULT_LAUNCH_COMMAND + [path_in_accelerate_package("test_utils", "scripts", "test_ddp_comm_hook.py")]
        execute_subprocess_async(cmd)


def main():
    # Prepare a small model with custom `DistributedDataParallelKwargs` and verify, on the prepared model,
    # both the values that were overridden and a few values that should keep their defaults.
    ddp_kwargs = DistributedDataParallelKwargs(bucket_cap_mb=15, find_unused_parameters=True)
    accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])

    # Skip this test due to TorchXLA not using torch.nn.parallel.DistributedDataParallel for model wrapping.
    if accelerator.distributed_type == DistributedType.XLA:
        return

    model = accelerator.prepare(torch.nn.Linear(100, 200))

    # Every failed check appends one newline-terminated message; nothing is raised until all checks ran.
    failures = []

    # Check the values changed in kwargs
    observed_bucket_cap_mb = model.bucket_bytes_cap // (1024 * 1024)
    if observed_bucket_cap_mb != 15:
        failures.append(f"Kwargs badly passed, should have `15` but found {observed_bucket_cap_mb}.\n")
    if model.find_unused_parameters is not True:
        failures.append(f"Kwargs badly passed, should have `True` but found {model.find_unused_parameters}.\n")

    # Check the values of the defaults
    if model.dim != 0:
        failures.append(f"Default value not respected, should have `0` but found {model.dim}.\n")
    if model.broadcast_buffers is not True:
        failures.append(f"Default value not respected, should have `True` but found {model.broadcast_buffers}.\n")
    if model.gradient_as_bucket_view is not False:
        failures.append(
            f"Default value not respected, should have `False` but found {model.gradient_as_bucket_view}.\n"
        )

    # Raise error at the end to make sure we don't stop at the first failure.
    if failures:
        raise ValueError("".join(failures))


# Run the checks in `main()` only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: tests/test_launch.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import unittest

from accelerate.utils.launch import prepare_multi_gpu_env


class TestPrepareMultiGpuEnv(unittest.TestCase):
    def test_auto_port_selection(self):
        # `main_process_port=0` is the input under test: after `prepare_multi_gpu_env(args)` the namespace
        # must carry a `master_port` attribute that is a digit-only string different from "0".
        launch_options = {
            "num_processes": 1,
            "num_machines": 1,
            "main_process_ip": "127.0.0.1",
            "main_process_port": 0,
            "machine_rank": 0,
            "module": False,
            "no_python": False,
            "debug": False,
            "gpu_ids": "all",
            "mixed_precision": "no",
            "dynamo_backend": "NO",
            "dynamo_mode": "default",
            "dynamo_use_fullgraph": False,
            "dynamo_use_dynamic": False,
            "dynamo_use_regional_compilation": False,
            "use_fsdp": False,
            "fsdp_cpu_ram_efficient_loading": False,
            "fsdp_sync_module_states": False,
            "fsdp_version": None,
            "fsdp_sharding_strategy": None,
            "fsdp_reshard_after_forward": False,
            "fsdp_offload_params": False,
            "fsdp_min_num_params": 0,
            "fsdp_auto_wrap_policy": None,
            "fsdp_transformer_layer_cls_to_wrap": None,
            "fsdp_backward_prefetch": None,
            "fsdp_state_dict_type": None,
            "fsdp_forward_prefetch": False,
            "fsdp_use_orig_params": False,
            "fsdp_activation_checkpointing": False,
            "use_tp": False,
            "tp_size": 1,
            "use_megatron_lm": False,
            "megatron_lm_tp_degree": 1,
            "megatron_lm_pp_degree": 1,
            "megatron_lm_gradient_clipping": 1.0,
            "megatron_lm_num_micro_batches": None,
            "megatron_lm_sequence_parallelism": None,
            "megatron_lm_recompute_activations": None,
            "megatron_lm_use_distributed_optimizer": None,
            "num_cpu_threads_per_process": 1,
            "enable_cpu_affinity": False,
            "same_network": False,
            "use_parallelism_config": False,
        }
        args = argparse.Namespace(**launch_options)

        prepare_multi_gpu_env(args)

        self.assertIn("master_port", vars(args))
        selected_port = args.master_port
        self.assertNotEqual(selected_port, "0")
        self.assertTrue(selected_port.isdigit())


================================================
FILE: tests/test_load_checkpoint_and_dispatch_with_broadcast.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import functools
import itertools
import unittest
from typing import Any, Callable

import torch
from huggingface_hub import hf_hub_download
from torch import distributed as dist
from torch import nn
from torch.distributed._composable.fsdp import fully_shard
from torch.distributed._tensor import DTensor
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.fsdp.wrap import _recursive_wrap, transformer_auto_wrap_policy
from torch.nn.parallel import DistributedDataParallel

from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from accelerate.test_utils import (
    execute_subprocess_async,
    get_torch_dist_unique_port,
    require_multi_device,
    run_first,
    torch_device,
)
from accelerate.test_utils.testing import require_torch_min_version, require_transformers
from accelerate.utils.imports import is_hpu_available, is_transformers_available


if is_transformers_available():
    from transformers import AutoConfig, AutoModel


def manage_process_group(func: Callable[..., Any]) -> Callable[..., Any]:
    """Manage the creation and destruction of the distributed process group for the wrapped function.

    When the returned wrapper is called and `torch.distributed` is not initialized yet, a process group is
    created with `world_size` set to the device count of the accelerator module (with the "hccl" backend when
    `torch_device` is "hpu" and `is_hpu_available(init_hccl=True)` is true). That group is destroyed again once
    `func` returns or raises. A process group that was already initialized is left untouched.

    Args:
        func: The function to run with an initialized process group.

    Returns:
        A wrapper that forwards all arguments to `func` and returns its result. The wrapper keeps the
        metadata of `func` (`__name__`, `__doc__`, ...) and exposes it as `__wrapped__`.
    """

    @functools.wraps(func)
    def wrapped(*args: Any, **kwargs: Any) -> Any:
        torch_accelerator_module = getattr(torch, torch_device, torch.cuda)
        # Only tear down what was set up here, so a group owned by the caller survives this call.
        initialized_here = False
        if not dist.is_initialized():
            if torch_device == "hpu" and is_hpu_available(init_hccl=True):
                dist.init_process_group(backend="hccl", world_size=torch_accelerator_module.device_count())
            else:
                dist.init_process_group(world_size=torch_accelerator_module.device_count())
            initialized_here = True
        try:
            return func(*args, **kwargs)
        finally:
            if initialized_here:
                dist.destroy_process_group()

    return wrapped


@manage_process_group
def load_checkpoint_and_dispatch_fsdp2():
    # Compare a reference model loaded with `from_pretrained` against a `fully_shard`-wrapped copy whose
    # weights are filled in by `load_checkpoint_and_dispatch(..., broadcast_from_rank0=True)`.
    device = torch.device(dist.get_rank())
    getattr(torch, torch_device, torch.cuda).set_device(device)

    model_id = "bigscience/bloom-560m"
    checkpoint_path = hf_hub_download("bigscience/bloom-560m", "pytorch_model.bin")

    reference_model = AutoModel.from_pretrained(model_id, device_map=device, torch_dtype=torch.float32)
    assert isinstance(reference_model, nn.Module)

    with init_empty_weights():
        sharded_model = AutoModel.from_config(AutoConfig.from_pretrained(model_id))
        sharded_model.tie_weights()
        assert isinstance(sharded_model, nn.Module)

    from transformers.models.gpt2.modeling_gpt2 import GPT2Block

    mesh = init_device_mesh(device.type, (dist.get_world_size(),))
    wrap_policy = functools.partial(
        transformer_auto_wrap_policy,
        transformer_layer_cls={GPT2Block, type(sharded_model)},
    )
    shard_on_mesh = functools.partial(fully_shard, mesh=mesh)
    sharded_model, _ = _recursive_wrap(
        sharded_model,
        auto_wrap_policy=wrap_policy,
        wrapper_cls=shard_on_mesh,
        ignored_modules=set(),
        ignored_params=set(),
    )

    def materialize(tensor):
        # Meta tensors get fresh (uninitialized) storage on `device`; any other tensor is moved there.
        if tensor.device == torch.device("meta"):
            return torch.empty_like(tensor, device=device)
        return tensor.to(device)

    sharded_model._apply(materialize)

    load_checkpoint_and_dispatch(sharded_model, checkpoint_path, strict=True, broadcast_from_rank0=True)

    reference_tensors = itertools.chain(reference_model.named_parameters(), reference_model.named_buffers())
    sharded_tensors = itertools.chain(sharded_model.named_parameters(), sharded_model.named_buffers())
    for (reference_name, reference_tensor), (sharded_name, sharded_tensor) in zip(
        reference_tensors, sharded_tensors
    ):
        assert reference_name == sharded_name
        assert isinstance(sharded_tensor, DTensor), sharded_name
        torch.testing.assert_close(reference_tensor, sharded_tensor.full_tensor(), msg=sharded_name)


@manage_process_group
def load_checkpoint_and_dispatch_no_broadcast_from_rank0():
    # Load the same checkpoint twice, once with `broadcast_from_rank0=True` and once with `False`, and check
    # that both models end up with matching parameters and buffers.
    device = torch.device(dist.get_rank())
    getattr(torch, torch_device, torch.cuda).set_device(device)

    model_id = "bigscience/bloom-560m"
    checkpoint_path = hf_hub_download("bigscience/bloom-560m", "pytorch_model.bin")

    def materialize(tensor):
        # Meta tensors get fresh (uninitialized) storage on `device`; any other tensor is moved there.
        if tensor.device == torch.device("meta"):
            return torch.empty_like(tensor, device=device)
        return tensor.to(device)

    def load_fresh_model(broadcast_from_rank0):
        # Build the model on the meta device, materialize it on `device`, then load the checkpoint into it.
        with init_empty_weights():
            fresh_model = AutoModel.from_config(AutoConfig.from_pretrained(model_id))
            fresh_model.tie_weights()
            assert isinstance(fresh_model, nn.Module)

        fresh_model._apply(materialize)
        load_checkpoint_and_dispatch(
            fresh_model, checkpoint_path, strict=True, broadcast_from_rank0=broadcast_from_rank0
        )
        return fresh_model

    broadcasted_model = load_fresh_model(True)
    non_broadcasted_model = load_fresh_model(False)

    broadcasted_tensors = itertools.chain(broadcasted_model.named_parameters(), broadcasted_model.named_buffers())
    non_broadcasted_tensors = itertools.chain(
        non_broadcasted_model.named_parameters(), non_broadcasted_model.named_buffers()
    )
    for (broadcasted_name, broadcasted_tensor), (non_broadcasted_name, non_broadcasted_tensor) in zip(
        broadcasted_tensors, non_broadcasted_tensors
    ):
        assert broadcasted_name == non_broadcasted_name
        torch.testing.assert_close(broadcasted_tensor, non_broadcasted_tensor, msg=broadcasted_name)


@manage_process_group
def load_checkpoint_and_dispatch_ddp():
    # Compare a reference model loaded with `from_pretrained` against the module inside a
    # `DistributedDataParallel` wrapper that was filled by `load_checkpoint_and_dispatch` with
    # `broadcast_from_rank0=True`.
    device = torch.device(dist.get_rank())
    getattr(torch, torch_device, torch.cuda).set_device(device)

    model_id = "bigscience/bloom-560m"
    checkpoint_path = hf_hub_download("bigscience/bloom-560m", "pytorch_model.bin")

    reference_model = AutoModel.from_pretrained(model_id, device_map=device, torch_dtype=torch.float32)
    assert isinstance(reference_model, nn.Module)

    with init_empty_weights():
        empty_model = AutoModel.from_config(AutoConfig.from_pretrained(model_id))
        empty_model.tie_weights()
        assert isinstance(empty_model, nn.Module)

    def materialize(tensor):
        # Meta tensors get fresh (uninitialized) storage on `device`; any other tensor is moved there.
        if tensor.device == torch.device("meta"):
            return torch.empty_like(tensor, device=device)
        return tensor.to(device)

    empty_model._apply(materialize)
    ddp_model = DistributedDataParallel(empty_model)

    load_checkpoint_and_dispatch(ddp_model.module, checkpoint_path, strict=True, broadcast_from_rank0=True)

    reference_tensors = itertools.chain(reference_model.named_parameters(), reference_model.named_buffers())
    loaded_tensors = itertools.chain(ddp_model.module.named_parameters(), ddp_model.module.named_buffers())
    for (reference_name, reference_tensor), (loaded_name, loaded_tensor) in zip(reference_tensors, loaded_tensors):
        assert reference_name == loaded_name
        torch.testing.assert_close(reference_tensor, loaded_tensor, msg=loaded_name)


@require_torch_min_version(version="2.4.0")
@require_transformers
@require_multi_device
@run_first
class TestLoadCheckpointAndDispatchWithBroadcast(unittest.TestCase):
    # Every test re-runs this very file under `torchrun` with one selection flag; the script section at the
    # bottom of the file maps the flag to the function to execute.

    def setUp(self):
        self.torch_accelerator_module = getattr(torch, torch_device, torch.cuda)

    def _run_under_torchrun(self, selection_flag):
        # One process per device reported by the accelerator module, on a port from
        # `get_torch_dist_unique_port()`.
        command = [
            "torchrun",
            f"--nproc_per_node={self.torch_accelerator_module.device_count()}",
            f"--master_port={get_torch_dist_unique_port()}",
            __file__,
            selection_flag,
        ]
        execute_subprocess_async(cmd=command)
        # successful return here == success - any errors would have caused an error in the sub-call

    def test_load_checkpoint_and_dispatch_fsdp2(self):
        self._run_under_torchrun("--fsdp2")

    def test_load_checkpoint_and_dispatch_no_broadcast_from_rank0(self):
        self._run_under_torchrun("--no_broadcast_from_rank0")

    def test_load_checkpoint_and_dispatch_ddp(self):
        self._run_under_torchrun("--ddp")


if __name__ == "__main__":
    # The script below is meant to be run under torch.distributed, on a machine with multiple GPUs:
    #
    # PYTHONPATH="src" python -m torch.distributed.run --nproc_per_node 2 --output_dir output_dir ./tests/test_load_checkpoint_and_dispatch_with_broadcast.py --fsdp2

    class CLIArgs(argparse.Namespace):
        # Typed view of the parsed command line; exactly one of these flags is expected to be set.
        fsdp2: bool
        ddp: bool
        no_broadcast_from_rank0: bool

    parser = argparse.ArgumentParser()
    selection_group = parser.add_mutually_exclusive_group()
    selection_group.add_argument("--fsdp2", action="store_true")
    selection_group.add_argument("--ddp", action="store_true")
    selection_group.add_argument("--no_broadcast_from_rank0", action="store_true")
    args = parser.parse_args(namespace=CLIArgs())

    # Pair each flag with its function; the flags are mutually exclusive, so at most one entry is selected.
    candidates = (
        (args.fsdp2, load_checkpoint_and_dispatch_fsdp2),
        (args.ddp, load_checkpoint_and_dispatch_ddp),
        (args.no_broadcast_from_rank0, load_checkpoint_and_dispatch_no_broadcast_from_rank0),
    )
    selected = [function for is_selected, function in candidates if is_selected]
    if not selected:
        raise ValueError("Missing test selection")
    selected[0]()


================================================
FILE: tests/test_logging.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import logging
import os

import pytest

from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.state import AcceleratorState


def current_lineno() -> int:
    # Report the line number of the call-site, i.e. of the frame directly above this helper.
    return inspect.currentframe().f_back.f_lineno


class CustomLogger(logging.LoggerAdapter):
    # Stand-in for a user-defined logger wrapper that always forces `stacklevel=3`.
    def log(self, level, msg, *args, **kwargs):
        # `accelerate.logging` is expected to honor a `stacklevel` chosen by the user. With the value `3`,
        # a call such as `CustomLogger.warning()` should be attributed to the line that made that call
        # rather than to the `self.logger.log()` call below. The forwarding must stay a direct call so
        # that the number of stack frames does not change.
        forwarded_kwargs = dict(kwargs, stacklevel=3)
        self.logger.log(level, msg, *args, **forwarded_kwargs)


@pytest.fixture(scope="module")
def accelerator():
    # Module-scoped `Accelerator`; the accelerator state is reset after the last test using it has run.
    instance = Accelerator()
    yield instance
    AcceleratorState._reset_state(True)


@pytest.mark.usefixtures("accelerator")
def test_log_stack(caplog):
    # A record emitted through the logger returned by `get_logger` must be attributed to the line of this
    # test that called `warning`.
    logger = get_logger(__name__)
    logging.basicConfig(
        format="%(filename)s:%(name)s:%(lineno)s:%(funcName)s - %(message)s",
        datefmt="%m/%d %H:%M:%S",
    )

    text = "Test"
    expected_text, _ = logger.process(text, {})
    # Keep the next two statements adjacent: the expected line number is "this line + 1".
    callsite_lineno = current_lineno() + 1  # the next line is the actual callsite
    logger.warning(text)

    assert len(caplog.records) == 1
    record = caplog.records[0]
    assert record.levelname == logging.getLevelName(logging.WARNING)
    assert record.filename == os.path.basename(__file__)
    assert record.name == __name__
    assert record.lineno == callsite_lineno
    assert record.funcName == test_log_stack.__name__
    assert record.message == expected_text


@pytest.mark.usefixtures("accelerator")
def test_custom_stacklevel(caplog):
    # Same check as for the plain logger, but going through a user-defined adapter that forces
    # `stacklevel=3` on top of the logger returned by `get_logger`.
    inner_logger = get_logger(__name__)
    logging.basicConfig(
        format="%(filename)s:%(name)s:%(lineno)s:%(funcName)s - %(message)s",
        datefmt="%m/%d %H:%M:%S",
    )
    logger = CustomLogger(inner_logger, {})

    text = "Test"
    expected_text, _ = inner_logger.process(text, {})
    # Keep the next two statements adjacent: the expected line number is "this line + 1".
    callsite_lineno = current_lineno() + 1  # the next line is the actual callsite
    logger.warning(text)

    # `CustomLogger.log` set custom `stacklevel=3`, so `logger.warning` should
    # log its callsite (rather than those of the wrapped logger).
    assert len(caplog.records) == 1
    record = caplog.records[0]
    assert record.levelname == logging.getLevelName(logging.WARNING)
    assert record.filename == os.path.basename(__file__)
    assert record.name == __name__
    assert record.lineno == callsite_lineno
    assert record.funcName == test_custom_stacklevel.__name__
    assert record.message == expected_text


================================================
FILE: tests/test_memory_utils.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from torch import nn

from accelerate.test_utils import (
    memory_allocated_func,
    require_non_cpu,
    require_non_torch_xla,
    torch_device,
)
from accelerate.utils.memory import find_executable_batch_size, release_memory


def raise_fake_out_of_memory():
    # Always raises: a `RuntimeError` whose text is the upper-cased device name followed by " out of memory.".
    device_label = torch_device.upper()
    raise RuntimeError(f"{device_label} out of memory.")


class ModelForTest(nn.Module):
    # Small model: Linear(3 -> 4), BatchNorm1d(4), Linear(4 -> 5), applied in that order.
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(3, 4)
        self.batchnorm = nn.BatchNorm1d(4)
        self.linear2 = nn.Linear(4, 5)

    def forward(self, x):
        hidden = self.linear1(x)
        normalized = self.batchnorm(hidden)
        return self.linear2(normalized)


class BigModelForTest(ModelForTest):
    # `ModelForTest` followed by an extra Linear(5 -> 1000) layer.
    def __init__(self):
        super().__init__()
        self.linear3 = nn.Linear(5, 1000)

    def forward(self, x):
        base_output = super().forward(x)
        return self.linear3(base_output)


class MemoryTest(unittest.TestCase):
    # Tests for `find_executable_batch_size` and `release_memory`.

    # Batch sizes the decorated function is expected to be called with when starting at 128 and failing with
    # a fake out-of-memory error for every size other than 8.
    EXPECTED_BATCH_SIZES = [
        128, 115, 103, 92, 82, 73, 65, 58, 52, 46, 41, 36,
        32, 28, 25, 22, 19, 17, 15, 13, 11, 9, 8,
    ]  # fmt: skip

    def test_memory_implicit(self):
        batch_sizes = []

        @find_executable_batch_size(starting_batch_size=128)
        def mock_training_loop_function(batch_size):
            batch_sizes.append(batch_size)
            if batch_size != 8:
                raise_fake_out_of_memory()

        mock_training_loop_function()
        assert batch_sizes == self.EXPECTED_BATCH_SIZES

    def test_memory_explicit(self):
        batch_sizes = []

        @find_executable_batch_size(starting_batch_size=128)
        def mock_training_loop_function(batch_size, arg1):
            batch_sizes.append(batch_size)
            if batch_size != 8:
                raise_fake_out_of_memory()
            return batch_size, arg1

        bs, arg1 = mock_training_loop_function("hello")
        assert batch_sizes == self.EXPECTED_BATCH_SIZES
        assert [bs, arg1] == [8, "hello"]

    def test_start_zero(self):
        @find_executable_batch_size(starting_batch_size=0)
        def mock_training_loop_function(batch_size):
            pass

        with self.assertRaises(RuntimeError) as cm:
            mock_training_loop_function()
        # The message check has to live outside the `with` block: statements placed after the raising call
        # inside the block are never executed.
        assert "No executable batch size found, reached zero." in cm.exception.args[0]

    def test_approach_zero(self):
        @find_executable_batch_size(starting_batch_size=16)
        def mock_training_loop_function(batch_size):
            if batch_size > 0:
                raise_fake_out_of_memory()

        with self.assertRaises(RuntimeError) as cm:
            mock_training_loop_function()
        assert "No executable batch size found, reached zero." in cm.exception.args[0]

    def test_verbose_guard(self):
        @find_executable_batch_size(starting_batch_size=128)
        def mock_training_loop_function(batch_size, arg1, arg2):
            if batch_size != 8:
                raise_fake_out_of_memory()

        with self.assertRaises(TypeError) as cm:
            mock_training_loop_function(128, "hello", "world")
        # NOTE(review): the previous checks sat inside the `with` block after the raising call and therefore
        # never ran. They expected the function to be reported as `f`, which is not the name used here, so
        # only the name-independent parts of the message are checked. Tighten these once the exact format
        # produced by `find_executable_batch_size` is confirmed.
        message = cm.exception.args[0]
        assert "Batch size was passed into" in message
        assert "arg1" in message
        assert "arg2" in message

    def test_any_other_error(self):
        @find_executable_batch_size(starting_batch_size=16)
        def mock_training_loop_function(batch_size):
            raise ValueError("Oops, we had an error!")

        with self.assertRaises(ValueError) as cm:
            mock_training_loop_function()
        assert "Oops, we had an error!" in cm.exception.args[0]

    @require_non_cpu
    @require_non_torch_xla
    def test_release_memory(self):
        starting_memory = memory_allocated_func()

        if torch_device.startswith("hpu"):
            # hpu has a minimum memory allocation that cannot be released,
            # we need to surpass it by using a bigger model (>5767296 bytes)
            model = BigModelForTest()
        else:
            model = ModelForTest()

        model.to(torch_device)
        assert memory_allocated_func() > starting_memory
        model = release_memory(model)
        assert memory_allocated_func() == starting_memory


================================================
FILE: tests/test_metrics.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
from packaging import version

from accelerate import debug_launcher
from accelerate.test_utils import (
    DEFAULT_LAUNCH_COMMAND,
    device_count,
    execute_subprocess_async,
    path_in_accelerate_package,
    require_cpu,
    require_huggingface_suite,
    require_multi_device,
    require_single_device,
    run_first,
)
from accelerate.utils import patch_environment


@require_huggingface_suite
@unittest.skipIf(version.parse(np.__version__) >= version.parse("2.0"), "Test requires numpy version < 2.0")
class MetricTester(unittest.TestCase):
    # Runs the `test_metrics` script either in-process (`main`), through `debug_launcher`, or as a
    # subprocess with the default launch command.

    def setUp(self):
        self.test_file_path = path_in_accelerate_package("test_utils", "scripts", "external_deps", "test_metrics.py")

        # Imported here rather than at module level, so importing this test module does not require it.
        from accelerate.test_utils.scripts.external_deps import test_metrics  # noqa: F401

        self.test_metrics = test_metrics

    @require_cpu
    def test_metric_cpu_noop(self):
        entry_point = self.test_metrics.main
        debug_launcher(entry_point, num_processes=1)

    @require_cpu
    def test_metric_cpu_multi(self):
        entry_point = self.test_metrics.main
        debug_launcher(entry_point)

    @require_single_device
    def test_metric_accelerator(self):
        self.test_metrics.main()

    @run_first
    @require_multi_device
    def test_metric_accelerator_multi(self):
        print(f"Found {device_count} devices.")
        launch_command = DEFAULT_LAUNCH_COMMAND + [self.test_file_path]
        with patch_environment(omp_num_threads=1, ACCELERATE_LOG_LEVEL="INFO"):
            execute_subprocess_async(launch_command)


================================================
FILE: tests/test_modeling_utils.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import tempfile
import unittest
import warnings
from collections import OrderedDict
from typing import Optional

import torch
import torch.nn as nn
from parameterized import parameterized
from safetensors.torch import save_file

from accelerate import init_empty_weights
from accelerate.big_modeling import cpu_offload
from accelerate.test_utils import (
    require_huggingface_suite,
    require_multi_device,
    require_non_cpu,
    require_non_hpu,
    torch_device,
)
from accelerate.utils.modeling import (
    align_module_device,
    check_device_map,
    clean_device_map,
    compute_module_sizes,
    compute_module_total_buffer_size,
    convert_file_size_to_int,
    find_tied_parameters,
    get_balanced_memory,
    get_module_size_with_ties,
    get_state_dict_offloaded_model,
    infer_auto_device_map,
    load_checkpoint_in_model,
    load_state_dict,
    named_module_tensors,
    retie_parameters,
    set_module_tensor_to_device,
)
from accelerate.utils.other import extract_model_from_parallel


# Rebind the imported `torch_device` for this module: any non-CPU device gets an explicit index 0
# (f"{torch_device}:0"), while "cpu" is kept as is.
torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"


class ModelForTest(nn.Module):
    # Small model: Linear(3 -> 4), BatchNorm1d(4), Linear(4 -> 5), applied in that order.
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(3, 4)
        self.batchnorm = nn.BatchNorm1d(4)
        self.linear2 = nn.Linear(4, 5)

    def forward(self, x):
        hidden = self.linear1(x)
        normalized = self.batchnorm(hidden)
        return self.linear2(normalized)


class NestedModelForTest(nn.Module):
    # Wraps a `ModelForTest` under the attribute `model`, so every tensor name gains a "model." prefix.
    def __init__(self):
        super().__init__()
        self.model = ModelForTest()

    def forward(self, x):
        inner_model = self.model
        return inner_model(x)


class LinearWithNonPersistentBuffers(nn.Module):
    # Linear layer whose `weight` and `bias` are registered as buffers instead of parameters. `weight` uses
    # the default persistence, `bias` is registered with `persistent=False` (or as `None` when disabled).
    # Both buffers are created with `torch.empty`, so their initial contents are uninitialized.
    def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None:
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        weight = torch.empty((out_features, in_features), device=device, dtype=dtype)
        self.register_buffer("weight", weight)

        if not bias:
            self.register_buffer("bias", None)
        else:
            bias_values = torch.empty(out_features, device=device, dtype=dtype)
            self.register_buffer("bias", bias_values, persistent=False)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        weight, bias = self.weight, self.bias
        return torch.nn.functional.linear(input, weight, bias)


class ModelSeveralDtypes(nn.Module):
    # Holds tensors of two dtypes: an integer buffer `int_param` of shape (15, 30) and a floating point
    # parameter `float_param` of shape (10, 5). `forward` uses neither of them.
    def __init__(self):
        super().__init__()
        random_ints = torch.randint(high=10, size=(15, 30))
        self.register_buffer("int_param", random_ints)
        # Assigning an `nn.Parameter` attribute registers it under that name.
        self.float_param = torch.nn.Parameter(torch.rand(10, 5))

    def forward(self, x):
        return x + 2


def sequential_model(num_layers):
    # Build an `nn.Sequential` of `num_layers` Linear(1000 -> 1000) layers named "linear1", "linear2", ...
    named_layers = OrderedDict()
    for index in range(1, num_layers + 1):
        named_layers[f"linear{index}"] = nn.Linear(1000, 1000)
    return nn.Sequential(named_layers)


class ModelingUtilsTester(unittest.TestCase):
    def check_set_module_tensor_for_device(self, model, device1, device2):
        """
        Move a parameter and a buffer of `model` from `device1` to `device2` and back with
        `set_module_tensor_to_device`, addressing each tensor once through its submodule and once through its
        dotted name on the root module.

        Args:
            model: Module exposing `linear1.weight` (shape `(4, 3)`) and `batchnorm.running_mean` (shape `(4,)`),
                both expected to live on `device1` when this is called.
            device1: Device the tensors start on and must be back on after each round trip.
            device2: Device the tensors are temporarily moved to. When it is the meta device, moving back without a
                `value` is expected to raise a `ValueError`, so a fresh random value is supplied afterwards.
        """
        origin = torch.device(device1)
        target = torch.device(device2)

        def round_trip(label, module, tensor_name, read_tensor, value_shape):
            # `read_tensor` re-reads the tensor from the model after every call instead of keeping a reference.
            with self.subTest(label):
                set_module_tensor_to_device(module, tensor_name, device2)
                assert read_tensor().device == target

                if target == torch.device("meta"):
                    with self.assertRaises(ValueError):
                        # We need a `value` to set the tensor back on device1
                        set_module_tensor_to_device(module, tensor_name, device1)
                    set_module_tensor_to_device(module, tensor_name, device1, value=torch.randn(value_shape))
                else:
                    set_module_tensor_to_device(module, tensor_name, device1)
                assert read_tensor().device == origin

        def read_weight():
            return model.linear1.weight

        def read_running_mean():
            return model.batchnorm.running_mean

        assert read_weight().device == origin
        round_trip(
            "Access by submodule and direct name for a parameter", model.linear1, "weight", read_weight, (4, 3)
        )
        round_trip("Access by module and full name for a parameter", model, "linear1.weight", read_weight, (4, 3))

        assert read_running_mean().device == origin
        round_trip(
            "Access by submodule and direct name for a buffer",
            model.batchnorm,
            "running_mean",
            read_running_mean,
            (4,),
        )
        # This label used to duplicate the "parameter" one although a buffer is exercised here.
        round_trip(
            "Access by module and full name for a buffer", model, "batchnorm.running_mean", read_running_mean, (4,)
        )

    def test_set_module_tensor_to_meta_and_cpu(self):
        """Round-trip the fixture's tensors between the CPU and the meta device."""
        self.check_set_module_tensor_for_device(ModelForTest(), "cpu", "meta")

    @require_non_cpu
    def test_set_module_tensor_to_cpu_and_gpu(self):
        """Round-trip the fixture's tensors between the CPU and the accelerator."""
        self.check_set_module_tensor_for_device(ModelForTest(), "cpu", torch_device)

    @require_non_cpu
    def test_set_module_tensor_to_meta_and_gpu(self):
        """Round-trip the fixture's tensors between the accelerator and the meta device."""
        on_accelerator = ModelForTest().to(torch_device)
        self.check_set_module_tensor_for_device(on_accelerator, torch_device, "meta")

    @require_non_hpu  # hpu does not support device indexing "hpu:1"
    @require_multi_device
    def test_set_module_tensor_between_gpus(self):
        """Round-trip the fixture's tensors between accelerator index 0 and index 1."""
        second_device = torch_device.replace("0", "1")
        on_first_device = ModelForTest().to(torch_device)
        self.check_set_module_tensor_for_device(on_first_device, torch_device, second_device)

    def test_set_module_tensor_sets_dtype(self):
        """Passing `dtype` must convert the tensor that gets stored on the module."""
        model = ModelForTest()
        current_weight = model.linear1.weight
        set_module_tensor_to_device(model, "linear1.weight", "cpu", value=current_weight, dtype=torch.float16)
        stored_dtype = model.linear1.weight.dtype
        assert stored_dtype == torch.float16

    def test_set_module_tensor_checks_shape(self):
        """A value whose shape differs from the existing tensor must be rejected with a descriptive `ValueError`."""
        expected_message = 'Trying to set a tensor of shape torch.Size([2, 2]) in "weight" (which has shape torch.Size([4, 3])), this looks incorrect.'
        model = ModelForTest()
        mismatched_value = torch.zeros((2, 2))
        with self.assertRaises(ValueError) as cm:
            set_module_tensor_to_device(model, "linear1.weight", "cpu", value=mismatched_value)
        raised_message = str(cm.exception)
        assert raised_message == expected_message

    def test_named_tensors(self):
        """Check the names yielded by `named_module_tensors` for its buffer, recursion and persistence options."""

        def tensor_names(module, **options):
            # Only the names matter here; the tensors themselves are discarded.
            return [name for name, _ in named_module_tensors(module, **options)]

        parameter_names = ["weight", "bias"]
        buffer_names = ["running_mean", "running_var", "num_batches_tracked"]

        # Leaf module: parameters come first, buffers follow unless excluded.
        batchnorm = nn.BatchNorm1d(4)
        assert tensor_names(batchnorm) == parameter_names + buffer_names
        assert tensor_names(batchnorm, include_buffers=False) == parameter_names

        # Container module: nothing is reported unless recursion is requested.
        model = ModelForTest()
        nested_parameters = [
            "linear1.weight",
            "linear1.bias",
            "batchnorm.weight",
            "batchnorm.bias",
            "linear2.weight",
            "linear2.bias",
        ]
        nested_buffers = [
            "batchnorm.running_mean",
            "batchnorm.running_var",
            "batchnorm.num_batches_tracked",
        ]
        assert tensor_names(model) == []
        assert tensor_names(model, recurse=True) == nested_parameters + nested_buffers
        assert tensor_names(model, include_buffers=False, recurse=True) == nested_parameters

        # Non-persistent buffers are only dropped when explicitly requested.
        buffered_linear = LinearWithNonPersistentBuffers(10, 10)
        assert tensor_names(buffered_linear, include_buffers=True, remove_non_persistent=False) == ["weight", "bias"]
        assert tensor_names(buffered_linear, include_buffers=True, remove_non_persistent=True) == ["weight"]

    def test_find_tied_parameters(self):
        """Check that `find_tied_parameters` groups the names of parameters that share the same tensor."""
        # Untied model, then one group that grows from two to three members.
        model = sequential_model(4)
        assert find_tied_parameters(model) == []

        model.linear2.weight = model.linear1.weight
        assert find_tied_parameters(model) == [["linear1.weight", "linear2.weight"]]

        model.linear4.weight = model.linear1.weight
        assert find_tied_parameters(model) == [["linear1.weight", "linear2.weight", "linear4.weight"]]

        # Two independent groups; ordered by group size before comparing.
        model = sequential_model(5)
        model.linear1.weight = model.linear4.weight
        model.linear2.weight = model.linear3.weight
        model.linear5.weight = model.linear2.weight
        groups_by_size = sorted(find_tied_parameters(model), key=len)
        assert groups_by_size == [
            ["linear1.weight", "linear4.weight"],
            ["linear2.weight", "linear3.weight", "linear5.weight"],
        ]

        # Tie across two nested blocks.
        blocks = OrderedDict([("block1", sequential_model(4)), ("block2", sequential_model(4))])
        model = nn.Sequential(blocks)
        model.block1.linear1.weight = model.block2.linear1.weight
        assert find_tied_parameters(model) == [["block1.linear1.weight", "block2.linear1.weight"]]

        # The very same layer registered twice ties both of its parameters.
        shared_layer = nn.Linear(10, 10)
        model = nn.Sequential(shared_layer, shared_layer)
        found = find_tied_parameters(model)
        assert sorted(found) == [["0.bias", "1.bias"], ["0.weight", "1.weight"]]

    def test_retie_parameters(self):
        """After `retie_parameters`, every name in a group must refer to the same parameter object."""
        # One pair
        model = sequential_model(2)
        retie_parameters(model, [["linear1.weight", "linear2.weight"]])
        assert model.linear1.weight is model.linear2.weight

        # One triple
        model = sequential_model(3)
        retie_parameters(model, [["linear1.weight", "linear2.weight", "linear3.weight"]])
        for tied_layer in (model.linear2, model.linear3):
            assert model.linear1.weight is tied_layer.weight

        # Two independent groups
        model = sequential_model(5)
        groups = [["linear1.weight", "linear4.weight"], ["linear2.weight", "linear3.weight", "linear5.weight"]]
        retie_parameters(model, groups)
        assert model.linear1.weight is model.linear4.weight
        for tied_layer in (model.linear3, model.linear5):
            assert model.linear2.weight is tied_layer.weight

        # Names that reach into nested blocks
        blocks = OrderedDict([("block1", sequential_model(4)), ("block2", sequential_model(4))])
        model = nn.Sequential(blocks)
        retie_parameters(model, [["block1.linear1.weight", "block2.linear1.weight"]])
        assert model.block1.linear1.weight is model.block2.linear1.weight

    def test_compute_module_sizes(self):
        """Check the per-module and per-tensor byte sizes in full precision and after `model.half()`."""
        model = ModelForTest()
        full_precision_sizes = {
            "": 236,
            "linear1": 64,
            "linear1.weight": 48,
            "linear1.bias": 16,
            "linear2": 100,
            "linear2.weight": 80,
            "linear2.bias": 20,
            "batchnorm": 72,
            "batchnorm.weight": 16,
            "batchnorm.bias": 16,
            "batchnorm.running_mean": 16,
            "batchnorm.running_var": 16,
            "batchnorm.num_batches_tracked": 8,
        }
        assert compute_module_sizes(model) == full_precision_sizes

        model.half()
        half_precision_sizes = {name: size // 2 for name, size in full_precision_sizes.items()}
        # `num_batches_tracked` is not converted to half, so it keeps its 8 bytes ...
        half_precision_sizes["batchnorm.num_batches_tracked"] = 8
        # ... which adds 4 bytes back to both the batchnorm module and the model total.
        for enclosing in ("batchnorm", ""):
            half_precision_sizes[enclosing] += 4

        assert compute_module_sizes(model) == half_precision_sizes

    def test_compute_module_total_buffer_size(self):
        """Check the total buffer size, including extra buffers added on a submodule and on the root module."""
        model = ModelForTest()
        model.linear1.register_buffer("test_buffer", torch.zeros(10, 10))
        model.register_buffer("test_buffer2", torch.zeros(20, 10))

        full_precision_total = compute_module_total_buffer_size(model)
        assert full_precision_total == 1240

        model.half()
        half_precision_total = compute_module_total_buffer_size(model)
        assert half_precision_total == 624

    def test_check_device_map(self):
        """A device map that covers every module passes; one that leaves `batchnorm` out raises a `ValueError`."""
        model = ModelForTest()

        whole_model_map = {"": 0}
        check_device_map(model, whole_model_map)

        incomplete_map = {"linear1": 0, "linear2": 1}
        with self.assertRaises(ValueError):
            check_device_map(model, incomplete_map)

        complete_map = {"linear1": 0, "linear2": 1, "batchnorm": 1}
        check_device_map(model, complete_map)

    def test_check_device_map_invalid_keys(self):
        """Keys that match no submodule must trigger a `UserWarning` that names each of them."""
        model = ModelForTest()

        # Three entries for real submodules, followed by two keys that do not exist on the model.
        device_map = {"linear1": "cpu", "batchnorm": "cpu", "linear2": "cpu"}
        device_map["invalid_module"] = 0
        device_map["another_invalid"] = 1

        with self.assertWarns(UserWarning) as cm:
            check_device_map(model, device_map)

        warning_msg = str(cm.warning)
        expected_fragments = (
            "device_map keys do not match any submodules",
            "invalid_module",
            "another_invalid",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, warning_msg)

    def shard_test_model(self, model, tmp_dir):
        """
        Write `model`'s state dict into `tmp_dir` as three shards (one per top-level submodule) together with a
        `weight_map.index.json` file mapping every state dict key to the shard that holds it.
        """
        shard_for_module = {
            "linear1": "checkpoint_part1.bin",
            "batchnorm": "checkpoint_part2.bin",
            "linear2": "checkpoint_part3.bin",
        }
        full_state = model.state_dict()

        # The first component of each key is the name of the top-level submodule that owns it.
        weight_map = {key: shard_for_module[key.split(".")[0]] for key in full_state}
        index_path = os.path.join(tmp_dir, "weight_map.index.json")
        with open(index_path, "w") as f:
            json.dump(weight_map, f)

        for prefix, shard_name in shard_for_module.items():
            shard = {}
            for key, tensor in full_state.items():
                if key.startswith(prefix):
                    shard[key] = tensor
            torch.save(shard, os.path.join(tmp_dir, shard_name))

    def test_load_checkpoint_in_model(self):
        """Loading must work from a single file, from a shard index file and from a folder of shards."""
        # Single checkpoint file
        model = ModelForTest()
        with tempfile.TemporaryDirectory() as tmp_dir:
            checkpoint_file = os.path.join(tmp_dir, "pt_model.bin")
            torch.save(model.state_dict(), checkpoint_file)
            load_checkpoint_in_model(model, checkpoint_file)

        # Index file that points to the shards
        model = ModelForTest()
        with tempfile.TemporaryDirectory() as tmp_dir:
            self.shard_test_model(model, tmp_dir)
            load_checkpoint_in_model(model, os.path.join(tmp_dir, "weight_map.index.json"))

        # Folder that contains the shards
        model = ModelForTest()
        with tempfile.TemporaryDirectory() as tmp_dir:
            self.shard_test_model(model, tmp_dir)
            load_checkpoint_in_model(model, tmp_dir)

    @require_non_cpu
    def test_load_checkpoint_in_model_one_gpu(self):
        """With a device map, `linear1` must land on the accelerator and the other two layers on the CPU."""
        device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": "cpu"}

        def assert_placement(loaded):
            assert loaded.linear1.weight.device == torch.device(torch_device)
            assert loaded.batchnorm.weight.device == torch.device("cpu")
            assert loaded.linear2.weight.device == torch.device("cpu")

        # Single checkpoint file
        model = ModelForTest()
        with tempfile.TemporaryDirectory() as tmp_dir:
            checkpoint_file = os.path.join(tmp_dir, "pt_model.bin")
            torch.save(model.state_dict(), checkpoint_file)
            load_checkpoint_in_model(model, checkpoint_file, device_map=device_map)
        assert_placement(model)

        # Index file that points to the shards
        model = ModelForTest()
        with tempfile.TemporaryDirectory() as tmp_dir:
            self.shard_test_model(model, tmp_dir)
            shard_index = os.path.join(tmp_dir, "weight_map.index.json")
            load_checkpoint_in_model(model, shard_index, device_map=device_map)
        assert_placement(model)

        # Folder that contains the shards
        model = ModelForTest()
        with tempfile.TemporaryDirectory() as tmp_dir:
            self.shard_test_model(model, tmp_dir)
            load_checkpoint_in_model(model, tmp_dir, device_map=device_map)
        assert_placement(model)

    @require_non_cpu
    def test_load_checkpoint_in_model_disk_offload(self):
        """Disk-offloaded parameters end up on the meta device; buffers only do so with `offload_buffers=True`."""
        device_map = {"linear1": "cpu", "batchnorm": "disk", "linear2": "cpu"}
        cpu = torch.device("cpu")
        meta = torch.device("meta")

        # Buffers are not offloaded by default, hence the first scenario passes no extra argument.
        scenarios = [({}, cpu), ({"offload_buffers": True}, meta)]
        for extra_kwargs, buffer_device in scenarios:
            model = ModelForTest()
            with tempfile.TemporaryDirectory() as tmp_dir:
                checkpoint_file = os.path.join(tmp_dir, "pt_model.bin")
                torch.save(model.state_dict(), checkpoint_file)
                load_checkpoint_in_model(
                    model, checkpoint_file, device_map=device_map, offload_folder=tmp_dir, **extra_kwargs
                )
            assert model.linear1.weight.device == cpu
            assert model.batchnorm.weight.device == meta
            assert model.batchnorm.running_mean.device == buffer_device
            assert model.linear2.weight.device == cpu

    @require_non_hpu  # hpu does not support device indexing "hpu:1"
    @require_multi_device
    def test_load_checkpoint_in_model_two_gpu(self):
        """With a device map, `linear1` goes to accelerator 0, `batchnorm` to the CPU and `linear2` to accelerator 1."""
        device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": 1}

        def assert_placement(loaded):
            assert loaded.linear1.weight.device == torch.device(torch_device)
            assert loaded.batchnorm.weight.device == torch.device("cpu")
            assert loaded.linear2.weight.device == torch.device(torch_device.replace("0", "1"))

        # Single checkpoint file
        model = ModelForTest()
        with tempfile.TemporaryDirectory() as tmp_dir:
            checkpoint_file = os.path.join(tmp_dir, "pt_model.bin")
            torch.save(model.state_dict(), checkpoint_file)
            load_checkpoint_in_model(model, checkpoint_file, device_map=device_map)
        assert_placement(model)

        # Index file that points to the shards
        model = ModelForTest()
        with tempfile.TemporaryDirectory() as tmp_dir:
            self.shard_test_model(model, tmp_dir)
            shard_index = os.path.join(tmp_dir, "weight_map.index.json")
            load_checkpoint_in_model(model, shard_index, device_map=device_map)
        assert_placement(model)

        # Folder that contains the shards
        model = ModelForTest()
        with tempfile.TemporaryDirectory() as tmp_dir:
            self.shard_test_model(model, tmp_dir)
            load_checkpoint_in_model(model, tmp_dir, device_map=device_map)
        assert_placement(model)

    def test_load_checkpoint_in_model_dtype(self):
        """Loading with `dtype=torch.float16` converts the float parameter but leaves the integer buffer as int64."""
        with tempfile.NamedTemporaryFile(suffix=".pt") as tmpfile:
            checkpoint_path = tmpfile.name
            source_model = ModelSeveralDtypes()
            torch.save(source_model.state_dict(), checkpoint_path)

            new_model = ModelSeveralDtypes()
            load_checkpoint_in_model(
                new_model,
                checkpoint_path,
                offload_state_dict=True,
                dtype=torch.float16,
                device_map={"": "cpu"},
            )

            assert new_model.int_param.dtype == torch.int64
            assert new_model.float_param.dtype == torch.float16

    @parameterized.expand([(None,), ({"": "cpu"},)])
    def test_load_checkpoint_in_model_unexpected_keys(self, device_map: Optional[dict]):
        """An extra checkpoint key is reported in the logs by default and raises when `strict=True`."""
        # Checkpoint of a regular model with one key ("foo") that the model does not define.
        checkpoint = ModelForTest().state_dict()
        checkpoint["foo"] = torch.rand(4, 5)

        with tempfile.NamedTemporaryFile(suffix=".pt") as tmpfile:
            torch.save(checkpoint, tmpfile)
            checkpoint_path = tmpfile.name

            model = ModelForTest()

            with self.assertLogs() as cm:
                load_checkpoint_in_model(model, checkpoint_path, device_map=device_map)

                reported = any("were not used when" in out for out in cm.output)
                self.assertTrue(reported)

            with self.assertRaises((ValueError, RuntimeError)):
                load_checkpoint_in_model(model, checkpoint_path, device_map=device_map, strict=True)

    def test_clean_device_map(self):
        """`clean_device_map` must collapse sibling entries that share a device into their parent entry."""
        # Everything on one device collapses to the root entry.
        single_device = {"a": 0, "b": 0, "c": 0}
        assert clean_device_map(single_device) == {"": 0}

        # Children of first-level modules are regrouped.
        first_level = {"a.x": 0, "a.y": 0, "b.x": 1, "b.y": 1, "c": 1}
        assert clean_device_map(first_level) == {"a": 0, "b": 1, "c": 1}

        # Children of second-level modules are regrouped.
        second_level = {"a.x": 0, "a.y": 0, "b.x.0": 1, "b.x.1": 1, "b.y.0": 2, "b.y.1": 2, "c": 2}
        regrouped = {"a": 0, "b.x": 1, "b.y": 2, "c": 2}
        assert clean_device_map(second_level) == regrouped

    def test_infer_auto_device_map(self):
        """
        Check the device maps inferred for `ModelForTest` under several memory budgets, with tied weights, and for
        a larger sequential model with and without `no_split_module_classes`.
        """
        model = ModelForTest()
        # model has size 236: linear1 64, batchnorm 72, linear2 100
        logged = []
        try:
            with self.assertLogs() as cm:
                device_map = infer_auto_device_map(model, max_memory={0: 200, 1: 200})
            logged = cm.output
        except AssertionError:
            # `assertLogs` fails when nothing was logged at all, which is fine here.
            pass
        # Checked outside of the `try` block so that a failure is reported instead of being swallowed together
        # with the "no logs" case.
        self.assertFalse(any("insufficient memory" in out for out in logged))

        # only linear1 fits on device 0 as we keep memory available for the maximum layer in case of offload
        assert device_map == {"linear1": 0, "batchnorm": 1, "linear2": 1}

        device_map = infer_auto_device_map(model, max_memory={0: 200, 1: 172, 2: 200})
        # On device 1, we don't care about keeping size available for the max layer, so even if there is just the
        # size available for batchnorm + linear2, they fit here.
        assert device_map == {"linear1": 0, "batchnorm": 1, "linear2": 1}

        model.linear1.weight = model.linear2.weight
        device_map = infer_auto_device_map(model, max_memory={0: 200, 1: 200})
        # By tying weights, the whole model fits on device 0
        assert device_map == {"": 0}

        # When splitting a bigger model, the split is done at the layer level
        model = nn.Sequential(ModelForTest(), ModelForTest(), ModelForTest())
        device_map = infer_auto_device_map(model, max_memory={0: 500, 1: 500})
        assert device_map == {"0": 0, "1.linear1": 0, "1.batchnorm": 0, "1.linear2": 1, "2": 1}

        # With no_split_module_classes, it's done at that module level
        model = nn.Sequential(ModelForTest(), ModelForTest(), ModelForTest())
        device_map = infer_auto_device_map(
            model, max_memory={0: 500, 1: 500}, no_split_module_classes=["ModelForTest"]
        )
        assert device_map == {"0": 0, "1": 1, "2": 1}

    def test_infer_auto_device_map_with_tied_weights(self):
        """
        Check the inferred device maps when weights are tied: across blocks (one pair, one triple, two groups),
        between layers of the same container, and between a module and one of its own submodules.
        """
        model = nn.Sequential(
            OrderedDict([("layer1", ModelForTest()), ("layer2", ModelForTest()), ("layer3", ModelForTest())])
        )
        model.layer3.linear2.weight = model.layer1.linear2.weight
        device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 500})
        expected = {"layer1": 0, "layer3.linear2": 0, "layer2": 1, "layer3.linear1": 1, "layer3.batchnorm": 1}
        assert device_map == expected

        # With three weights tied together
        model.layer2.linear2.weight = model.layer1.linear2.weight
        device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 500})
        expected = {
            "layer1": 0,
            "layer2.linear2": 0,
            "layer3.linear2": 0,
            "layer2.linear1": 1,
            "layer2.batchnorm": 1,
            "layer3.linear1": 1,
            "layer3.batchnorm": 1,
        }
        assert device_map == expected

        # With two groups of weights tied together
        model.layer2.linear1.weight = model.layer1.linear1.weight
        device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 500})
        expected = {
            "layer1": 0,
            "layer2.linear1": 0,
            "layer2.linear2": 0,
            "layer3.linear2": 0,
            "layer2.batchnorm": 1,
            "layer3.linear1": 1,
            "layer3.batchnorm": 1,
        }
        assert device_map == expected

        # With weights ties in the same module
        model = nn.Sequential(
            OrderedDict(
                [
                    ("linear1", nn.Linear(4, 4)),
                    ("linear2", nn.Linear(6, 6)),
                    ("linear3", nn.Linear(4, 4)),
                    ("linear4", nn.Linear(6, 6)),
                ]
            )
        )
        model.linear3.weight = model.linear1.weight
        model.linear3.bias = model.linear1.bias
        device_map = infer_auto_device_map(model, max_memory={0: 250, 1: 400})
        expected = {"linear1": 0, "linear2": 1, "linear3": 0, "linear4": 1}
        assert device_map == expected

        # With tied weights sharing a same prefix name (`compute.weight` vs `compute.weight_submodule.parameter`)
        class SubModule(torch.nn.Module):
            def __init__(self, ref_to_parameter):
                super().__init__()
                self.parameter = ref_to_parameter

            def forward(self, x):
                # Use the input `x`; the module has no `x` attribute to read from.
                return x + torch.max(self.parameter)

        class LinearModuleAndSubModule(torch.nn.Linear):
            def __init__(self, in_features, out_features):
                super().__init__(in_features, out_features)
                self.weight_submodule = SubModule(self.weight)

            def forward(self, x):
                return torch.nn.functional.linear(self.weight_submodule(x), self.weight)

        class Model(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.compute = LinearModuleAndSubModule(3, 8)

            def forward(self, x):
                return self.compute(x)

        model = Model()

        device_memory = {0: 4, "cpu": 96000}  # Low memory device, just to force splitting and trigger the error
        # Only checks that the call completes; the returned map is not inspected.
        infer_auto_device_map(model, device_memory)

    @require_huggingface_suite
    def test_infer_auto_device_map_on_t0pp(self):
        """The three tied embedding entries of an empty-weights T0pp model must all be placed on device 0."""
        from transformers import AutoConfig, AutoModelForSeq2SeqLM

        t0pp_config = AutoConfig.from_pretrained("bigscience/T0pp")
        with init_empty_weights():
            model = AutoModelForSeq2SeqLM.from_config(t0pp_config)
        model.tie_weights()

        # Parameters whose name contains "wo" are accounted for in float32, everything else in float16.
        wo_parameter_names = (name for name, _ in model.named_parameters() if "wo" in name)
        special_dtypes = dict.fromkeys(wo_parameter_names, torch.float32)
        memory_budget = {0: 10**10, 1: 10**10, "cpu": 10**10}
        device_map = infer_auto_device_map(
            model,
            no_split_module_classes=["T5Block"],
            dtype=torch.float16,
            max_memory=memory_budget,
            special_dtypes=special_dtypes,
        )

        # The 3 tied weights should all be on device 0
        for tied_entry in ("shared", "encoder.embed_tokens", "decoder.embed_tokens"):
            assert device_map[tied_entry] == 0

    def test_infer_auto_device_map_with_buffer_check(self):
        """A warning is expected when offloaded buffers cannot fit on device 0, unless `offload_buffers=True`."""
        model = ModelForTest()
        model.linear1.register_buffer("test_buffer1", torch.zeros(10, 2))
        model.batchnorm.register_buffer("test_buffer2", torch.zeros(10, 3))
        model.linear2.register_buffer("test_buffer3", torch.zeros(10, 3))
        # model has size 236(parameters) + 360(buffers): linear1 64 + 80, batchnorm 72 + 160, linear2 100 + 120
        expected_map = {"linear1": 0, "batchnorm": "cpu", "linear2": "cpu"}

        # Only linear1 (144) fits on device 0, and remaining buffers (batchnorm's 160 + linear2's 120 = 280) won't fit
        # device 0, because they will also be loaded to device 0 all at once when inferencing without offload_buffers
        # Should print a warning as intended in such case
        with self.assertWarns(Warning):
            device_map = infer_auto_device_map(model, max_memory={0: 400, "cpu": "1GB"})
        assert device_map == expected_map

        # Only linear1 (144) fits on device 0, and remaining buffers (batchnorm's 160 + linear2's 120 = 280) won't fit
        # device 0, but with offload_buffers they won't be loaded to device 0 all at once, so it's ok now
        # Should NOT print a warning in such case
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            device_map = infer_auto_device_map(model, max_memory={0: 400, "cpu": "1GB"}, offload_buffers=True)
        assert len(caught) == 0
        assert device_map == expected_map

    def test_infer_auto_device_map_with_buffer_check_and_multi_devices(self):
        """Same buffer check as the single-device test, with a second accelerator that may or may not hold them."""
        model = ModelForTest()
        model.linear1.register_buffer("test_buffer1", torch.zeros(10, 2))
        model.batchnorm.register_buffer("test_buffer2", torch.zeros(10, 3))
        model.linear2.register_buffer("test_buffer3", torch.zeros(10, 3))
        model.linear3 = nn.Linear(4, 5)
        model.linear3.register_buffer("test_buffer4", torch.zeros(10, 2))
        # model has size 336(parameters) + 440(buffers): linear1 64 + 80, batchnorm 72 + 160, linear2 100 + 120,
        # linear3 100 + 80
        expected_map = {"linear1": 0, "batchnorm": 1, "linear2": "cpu", "linear3": "cpu"}

        def infer_recording_warnings(**kwargs):
            # Runs the inference with every warning recorded, and returns both the map and the recorded warnings.
            with warnings.catch_warnings(record=True) as caught:
                warnings.simplefilter("always")
                inferred = infer_auto_device_map(model, **kwargs)
            return inferred, caught

        # Now we have two devices, linear1 will fit on device 0, batchnorm will fit on device 1, and the second device
        # can hold all remaining buffers
        # Should NOT print a warning in such case
        device_map, caught = infer_recording_warnings(max_memory={0: 400, 1: 400, "cpu": "1GB"})
        assert len(caught) == 0
        assert device_map == expected_map

        # Now we have two devices, but neither the first nor the second device can hold all remaining buffers
        # Should print a warning as intended in such case
        with self.assertWarns(Warning):
            device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 200, "cpu": "1GB"})
        assert device_map == expected_map

        # Now we have two devices, neither can hold all the buffers, but we are using the offload_buffers=True
        # Should NOT print a warning in such case
        device_map, caught = infer_recording_warnings(
            max_memory={0: 400, 1: 200, "cpu": "1GB"}, offload_buffers=True
        )
        assert len(caught) == 0
        assert device_map == expected_map

    def test_infer_auto_device_map_with_fallback_allocation(self):
        """
        Without `fallback_allocation` nothing is placed on device 0 and an "insufficient memory" message is logged;
        with it, `module.linear2` is placed on device 0 and no such message may be logged.
        """
        # Create a model where modules cannot be allocated without fallback_allocation
        # Define the inner module with its layers
        inner_module = nn.Sequential(
            OrderedDict([("linear1", nn.Linear(10, 4)), ("linear2", nn.Linear(4, 4)), ("linear3", nn.Linear(4, 8))])
        )

        # Wrap the inner module in another module
        model = nn.Sequential(OrderedDict([("module", inner_module)]))

        max_memory = {0: 256}

        # Without fallback_allocation
        with self.assertLogs() as cm:
            device_map = infer_auto_device_map(model, max_memory=max_memory, fallback_allocation=False)
            # No module should be assigned to device 0
            assert all(device != 0 for device in device_map.values())
            # Check for warning about insufficient memory
            self.assertTrue(any("insufficient memory" in out for out in cm.output))

        # With fallback_allocation
        logged = []
        try:
            with self.assertLogs() as cm:
                device_map = infer_auto_device_map(model, max_memory=max_memory, fallback_allocation=True)
            logged = cm.output
        except AssertionError:
            # `assertLogs` fails when nothing was logged at all, which is fine here.
            pass
        # Checked outside of the `try` block so that a failure is reported instead of being swallowed together
        # with the "no logs" case.
        self.assertFalse(any("insufficient memory" in out for out in logged))

        # At least one submodule should be assigned to device 0
        assert any(device == 0 for device in device_map.values())

        expected_device_map = {"module.linear1": "disk", "module.linear2": 0, "module.linear3": "disk"}
        assert device_map == expected_device_map

    def test_infer_auto_device_map_with_fallback_allocation_no_fit(self):
        """
        Even with `fallback_allocation=True`, nothing may be placed on device 0 when no submodule fits; if anything
        was logged, it must mention the insufficient memory.
        """
        # Create a model where even the smallest submodules cannot fit
        inner_module = nn.Sequential(
            OrderedDict(
                [("linear1", nn.Linear(10, 10)), ("linear2", nn.Linear(10, 10)), ("linear3", nn.Linear(10, 10))]
            )
        )

        # Wrap the inner module in another module
        model = nn.Sequential(OrderedDict([("module", inner_module)]))

        max_memory = {0: 30}

        # With fallback_allocation
        logged = None
        try:
            with self.assertLogs() as cm:
                device_map = infer_auto_device_map(model, max_memory=max_memory, fallback_allocation=True)
            logged = cm.output
        except AssertionError:
            # `assertLogs` fails when nothing was logged at all; the log check below is skipped in that case.
            pass

        # The assertions live outside of the `try` block: inside it, their failures were silently swallowed.
        # No module should be assigned to device 0
        assert all(device != 0 for device in device_map.values())
        if logged is not None:
            # Check for warning about insufficient memory
            self.assertTrue(any("insufficient memory" in out for out in logged))

    def test_infer_auto_device_map_with_fallback_allocation_partial_fit(self):
        # Two-level hierarchy: three containers, each holding a pair of linear layers
        class CustomModule(nn.Module):
            def __init__(self):
                super().__init__()
                self.submodule1 = nn.Linear(20, 20)
                self.submodule2 = nn.Linear(20, 20)

        containers = OrderedDict((f"module{idx}", CustomModule()) for idx in (1, 2, 3))
        model = nn.Sequential(containers)

        # Run the inference with fallback allocation enabled
        device_map = infer_auto_device_map(model, max_memory={0: 5000}, fallback_allocation=True)

        # At least one entry of the map has to land on device 0
        on_device_0 = [name for name in device_map if device_map[name] == 0]
        assert len(on_device_0) > 0

    def test_infer_auto_device_map_with_fallback_allocation_tied_weights(self):
        # Two linear layers that share one weight tensor
        class TiedWeightsModel(nn.Module):
            def __init__(self):
                super().__init__()
                self.linear1 = nn.Linear(10, 10)
                self.linear2 = nn.Linear(10, 10)
                self.linear2.weight = self.linear1.weight

        tied_model = TiedWeightsModel()
        budget = {0: 600}

        # Run the inference with fallback allocation enabled
        device_map = infer_auto_device_map(tied_model, max_memory=budget, fallback_allocation=True)

        # The model is expected to be mapped to device 0 as a single root entry
        assert device_map == {"": 0}

    def test_infer_auto_device_map_with_fallback_allocation_and_buffers(self):
        # Build a three-layer model and register one extra buffer on every layer
        layers = OrderedDict()
        layers["linear1"] = nn.Linear(10, 10)
        layers["batchnorm"] = nn.BatchNorm1d(10)
        layers["linear2"] = nn.Linear(10, 10)
        model = nn.Sequential(layers)

        extra_buffers = (("linear1", "buffer1"), ("batchnorm", "buffer2"), ("linear2", "buffer3"))
        for layer_name, buffer_name in extra_buffers:
            getattr(model, layer_name).register_buffer(buffer_name, torch.zeros(5))

        # Fallback allocation with buffer offloading disabled is expected to warn
        with self.assertWarns(Warning) as caught:
            device_map = infer_auto_device_map(
                model, max_memory={0: 678}, fallback_allocation=True, offload_buffers=False
            )

        # The captured warning must carry one of the expected messages
        message = str(caught.warning)
        assert "offload_buffers" in message or "Current model requires" in message

        # Only the batchnorm layer lands on device 0; both linear layers go to disk
        assert device_map == {"batchnorm": 0, "linear1": "disk", "linear2": "disk"}

    @require_non_cpu
    def test_get_balanced_memory(self):
        model = ModelForTest()
        # model has size 236: linear1 64, batchnorm 72, linear2 100
        # Each case pairs the requested budget with the balanced budget expected back.
        cases = [
            ({0: 200, 1: 200}, {0: 200, 1: 200}),
            # We should be able to set models on a non-contiguous sub-set of devices
            ({0: 200, 2: 200}, {0: 200, 2: 200}),
            ({0: 300, 1: 300}, {0: 215, 1: 300}),
            # Last device always get max memory to give more buffer and avoid accidental CPU offload
            ({0: 300, 1: 500}, {0: 215, 1: 500}),
            # Last device always get max memory to give more buffer, even if CPU is provided
            ({0: 300, "cpu": 1000}, {0: 300, "cpu": 1000}),
            # If we set a device to 0, it's not counted.
            ({0: 0, 1: 300, 2: 300}, {0: 0, 1: 215, 2: 300}),
            ({0: 0, "cpu": 100}, {0: 0, "cpu": 100}),
        ]

        for requested, expected in cases:
            balanced = get_balanced_memory(model, max_memory=requested)
            assert expected == balanced

    # Tests that get_module_size_with_ties returns the correct tied modules in
    # models with tied parameters whose parent modules share the same name prefix
    # See issue #3308: https://github.com/huggingface/accelerate/issues/3308
    def test_get_module_size_with_ties(self):
        # Use more than 10 layers in a ModuleList so that several layer names
        # share a prefix, e.g. "1" and "10"
        num_layers = 15
        model = nn.ModuleList([nn.Linear(10, 10) for _ in range(num_layers)])
        # Chain-tie .weight so that every layer shares the same tensor
        layers = list(model)
        for previous, current in zip(layers, layers[1:]):
            current.weight = previous.weight

        layer_names = [str(idx) for idx in range(num_layers)]
        # A tied parameter group is sorted alphabetically,
        # mimicking the output of find_tied_parameters
        tied_parameters = [sorted(f"{name}.weight" for name in layer_names)]

        # Byte sizes of one layer's tensors
        first_layer = model[0]
        weight_size = first_layer.weight.element_size() * first_layer.weight.numel()
        bias_size = first_layer.bias.element_size() * first_layer.bias.numel()
        layer_size = weight_size + bias_size

        # Size table covering the root, every layer and every tensor
        module_sizes = {"": num_layers * layer_size}
        module_sizes.update((name, layer_size) for name in layer_names)
        module_sizes.update((f"{name}.weight", weight_size) for name in layer_names)
        module_sizes.update((f"{name}.bias", bias_size) for name in layer_names)

        # Reproduce the arguments infer_auto_device_map passes to
        # get_module_size_with_ties while it processes the first module
        modules_to_treat = list(model.named_children())[1:]
        tied_params = tied_parameters[0][1:]
        module_size = layer_size

        module_size_with_ties, tied_module_names, tied_modules = get_module_size_with_ties(
            tied_params, module_size, module_sizes, modules_to_treat
        )

        # Expected lists follow the ordering of the module names, i.e. the same
        # order as the tied_parameters returned by find_tied_parameters
        ordered = sorted(modules_to_treat, key=lambda item: item[0])
        expected_tied_module_names = [name for name, _ in ordered]
        expected_tied_modules = [module for _, module in ordered]

        assert module_size_with_ties == module_size + (num_layers - 1) * bias_size
        assert tied_module_names == expected_tied_module_names
        assert tied_modules == expected_tied_modules

    @require_non_cpu
    def test_load_state_dict(self):
        # Save three random tensors and reload them under several device maps
        state_dict = {name: torch.randn(4, 5) for name in "abc"}
        device_maps = [
            {"a": "cpu", "b": 0, "c": "disk"},
            {"a": 0, "b": 0, "c": "disk"},
            {"a": 0, "b": 0, "c": 0},
        ]

        for device_map in device_maps:
            with tempfile.TemporaryDirectory() as tmp_dir:
                checkpoint_file = os.path.join(tmp_dir, "model.safetensors")
                save_file(state_dict, checkpoint_file, metadata={"format": "pt"})

                loaded_state_dict = load_state_dict(checkpoint_file, device_map=device_map)

            for param, device in device_map.items():
                # Tensors mapped to "disk" are expected on the CPU once loaded
                expected_device = torch.device("cpu" if device == "disk" else device)
                assert loaded_state_dict[param].device == expected_device

    def test_convert_file_size(self):
        # Inputs that must convert, paired with the expected number of bytes
        valid_cases = [
            ("0MB", 0),
            ("100MB", 100 * (10**6)),
            ("2GiB", 2 * (2**30)),
            ("512KiB", 512 * (2**10)),
            ("1.5GB", 1.5 * (10**9)),
            ("100KB", 100 * (10**3)),
            (500, 500),
        ]
        for size, expected in valid_cases:
            result = convert_file_size_to_int(size)
            assert result == expected

        # Malformed or negative sizes must be rejected
        for malformed in ("5MBB", "5k0MB", "-1GB"):
            with self.assertRaises(ValueError):
                convert_file_size_to_int(malformed)

    def test_get_state_dict_offloaded_model(self):
        # The state dict recovered from a CPU-offloaded model must match the one
        # captured before offloading, for both the flat and the nested fixture
        execution_device = torch.device(torch_device)

        for model_cls in (ModelForTest, NestedModelForTest):
            model = model_cls()
            reference = model.state_dict()

            cpu_offload(model, execution_device=execution_device)
            recovered = get_state_dict_offloaded_model(model)

            assert reference.keys() == recovered.keys()
            for key, tensor in reference.items():
                assert torch.equal(tensor, recovered[key])

    def test_align_module_device_simple(self):
        model = ModelForTest()
        execution_device = torch.device(torch_device)
        model_device = torch.device("cpu")

        def check_devices(linear1, batchnorm, linear2):
            # Compare the device of each layer's weight with the expected one
            assert model.linear1.weight.device == linear1
            assert model.batchnorm.weight.device == batchnorm
            assert model.linear2.weight.device == linear2

        # Without an explicit execution device nothing moves
        with align_module_device(model.batchnorm):
            check_devices(model_device, model_device, model_device)
        check_devices(model_device, model_device, model_device)

        # With an explicit execution device only the aligned module moves,
        # and it is back on its original device after the context exits
        with align_module_device(model.batchnorm, execution_device=execution_device):
            check_devices(model_device, execution_device, model_device)
        check_devices(model_device, model_device, model_device)

    def test_align_module_device_offloaded(self):
        model = ModelForTest()
        execution_device = torch.device(torch_device)
        offload_device = torch.device("meta")
        cpu_offload(model, execution_device=execution_device)

        def check_devices(linear1, batchnorm, linear2):
            # Compare the device of each layer's weight with the expected one
            assert model.linear1.weight.device == linear1
            assert model.batchnorm.weight.device == batchnorm
            assert model.linear2.weight.device == linear2

        # Default: the aligned module is brought onto the offload execution device
        with align_module_device(model.batchnorm):
            check_devices(offload_device, execution_device, offload_device)
        check_devices(offload_device, offload_device, offload_device)

        # Explicit execution device: the aligned module is brought onto that device
        with align_module_device(model.batchnorm, execution_device="cpu"):
            check_devices(offload_device, torch.device("cpu"), offload_device)
        check_devices(offload_device, offload_device, offload_device)

    def test_align_module_device_offloaded_nested(self):
        # Align every module of a CPU-offloaded nested model in turn and check
        # that the parameters it owns directly sit on the requested device.
        model = NestedModelForTest()
        execution_device = torch.device(torch_device)
        align_device = torch.device("cpu")
        cpu_offload(model, execution_device=execution_device)
        for module in model.modules():
            with align_module_device(module, align_device):
                # Inspect the module being aligned, not the root model: iterating
                # the root's direct parameters would check the same set every time
                # instead of the parameters this context is supposed to move.
                for param in module.parameters(recurse=False):
                    assert param.device == align_device

    def test_extract_model_from_parallel_partial_compile(self):
        """Compiling only a submodule must not break extraction and must keep the compiled wrapper."""
        model = ModelForTest()
        model.linear2 = torch.compile(model.linear2)

        # Sanity check: the compiled marker sits on the submodule only
        compiled_marker = "_orig_mod"
        assert not hasattr(model, compiled_marker)
        assert hasattr(model.linear2, compiled_marker)

        # Default extraction
        extracted = extract_model_from_parallel(model)
        x = torch.randn(2, 3)
        torch.testing.assert_close(actual=model(x), expected=extracted(x))
        assert isinstance(extracted, ModelForTest)
        assert hasattr(extracted.linear2, compiled_marker)

        # Extraction that asks not to keep torch.compile wrappers
        extracted_no_keep = extract_model_from_parallel(model, keep_torch_compile=False)
        assert hasattr(extracted_no_keep.linear2, compiled_marker)
        torch.testing.assert_close(actual=model(x), expected=extracted_no_keep(x))


================================================
FILE: tests/test_multidevice.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import unittest
from unittest import skip

import torch

from accelerate import Accelerator
from accelerate.big_modeling import dispatch_model
from accelerate.test_utils import (
    DEFAULT_LAUNCH_COMMAND,
    assert_exception,
    device_count,
    execute_subprocess_async,
    get_launch_command,
    path_in_accelerate_package,
    require_huggingface_suite,
    require_multi_device,
    require_non_torch_xla,
    require_pippy,
    require_torchvision,
    run_first,
    torch_device,
)
from accelerate.utils import is_hpu_available, patch_environment


class MultiDeviceTester(unittest.TestCase):
    """Runs the packaged test scripts in subprocesses through the accelerate launch command."""

    # Scripts shipped inside the accelerate package that the tests below launch
    test_file_path = path_in_accelerate_package("test_utils", "scripts", "test_script.py")
    data_loop_file_path = path_in_accelerate_package("test_utils", "scripts", "test_distributed_data_loop.py")
    operation_file_path = path_in_accelerate_package("test_utils", "scripts", "test_ops.py")
    pippy_file_path = path_in_accelerate_package("test_utils", "scripts", "external_deps", "test_pippy.py")
    merge_weights_file_path = path_in_accelerate_package("test_utils", "scripts", "test_merge_weights.py")

    def _launch(self, cmd, **env_kwargs):
        # Execute `cmd` in a subprocess while the given environment overrides are active
        with patch_environment(**env_kwargs):
            execute_subprocess_async(cmd)

    @run_first
    @require_multi_device
    def test_multi_device(self):
        print(f"Found {device_count} {torch_device} devices.")
        self._launch(DEFAULT_LAUNCH_COMMAND + [self.test_file_path], omp_num_threads=1)

    @run_first
    @require_multi_device
    def test_multi_device_ops(self):
        print(f"Found {device_count} {torch_device} devices.")
        self._launch(DEFAULT_LAUNCH_COMMAND + [self.operation_file_path], omp_num_threads=1)

    @run_first
    @require_multi_device
    def test_pad_across_processes(self):
        print(f"Found {device_count} {torch_device} devices.")
        # Launch this very file; its `__main__` section holds the actual checks
        this_file = inspect.getfile(self.__class__)
        self._launch(DEFAULT_LAUNCH_COMMAND + [this_file], omp_num_threads=1)

    @run_first
    @require_multi_device
    def test_multi_device_merge_fsdp_weights(self):
        print(f"Found {device_count} {torch_device} devices.")
        self._launch(DEFAULT_LAUNCH_COMMAND + [self.merge_weights_file_path], omp_num_threads=1)

    @run_first
    @require_non_torch_xla
    @require_multi_device
    def test_distributed_data_loop(self):
        """
        Exercises distributed training or evaluation in the case where the dataset size
        is not evenly divisible by the batch size.
        """
        print(f"Found {device_count} devices, using 2 devices only")
        cmd = get_launch_command(num_processes=2) + [self.data_loop_file_path]

        # Name of the setting that restricts the visible devices for each backend;
        # any backend not listed falls back to the CUDA one
        visible_devices_key = {
            "xpu": "ze_affinity_mask",
            "npu": "ascend_rt_visible_devices",
            "mlu": "mlu_visible_devices",
            "sdaa": "sdaa_visible_devices",
        }.get(torch_device, "cuda_visible_devices")

        env_kwargs = {"omp_num_threads": 1, visible_devices_key: "0,1"}
        self._launch(cmd, **env_kwargs)

    @run_first
    @require_pippy
    @require_torchvision
    @require_multi_device
    @require_huggingface_suite
    @skip("Will soon deprecate pippy")
    def test_pippy(self):
        """
        Verifies the integration with the pippy framework
        """
        print(f"Found {device_count} {torch_device} devices")
        cmd = get_launch_command(multi_gpu=True, num_processes=device_count) + [self.pippy_file_path]
        self._launch(cmd, omp_num_threads=1)


if __name__ == "__main__":
    # Script section launched by `test_pad_across_processes`: every process builds a
    # tensor whose first dimension depends on its rank, then checks the padded result.
    accelerator = Accelerator()
    state = accelerator.state
    num_rows = state.process_index + 2
    tensor = torch.randint(0, 10, (num_rows, 10)).to(accelerator.device)
    # First dimension expected after padding
    padded_rows = state.num_processes + 1

    errors = []

    # Default padding: zeros are appended after the original rows
    tensor1 = accelerator.pad_across_processes(tensor)
    if tensor1.shape[0] != padded_rows:
        errors.append(f"Found shape {tensor1.shape} but should have {padded_rows} at dim 0.")
    if not torch.equal(tensor1[:num_rows], tensor):
        errors.append("Tensors have different values.")
    if not torch.all(tensor1[num_rows:] == 0):
        errors.append("Padding was not done with the right value (0).")

    # `pad_first=True`: zeros are inserted before the original rows
    tensor2 = accelerator.pad_across_processes(tensor.clone(), pad_first=True)
    if tensor2.shape[0] != padded_rows:
        errors.append(f"Found shape {tensor2.shape} but should have {padded_rows} at dim 0.")
    data_start = state.num_processes - state.process_index - 1
    if not torch.equal(tensor2[data_start:], tensor):
        errors.append("Tensors have different values.")
    if not torch.all(tensor2[:data_start] == 0):
        errors.append("Padding was not done with the right value (0).")

    # Raise error at the end to make sure we don't stop at the first failure.
    error_msg = "".join(errors)
    if error_msg:
        raise ValueError(error_msg)

    # Check device_map
    accelerator.print("Test `device_map` cannot be prepared.")

    class ModelForTest(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear1 = torch.nn.Linear(3, 4)
            self.batchnorm = torch.nn.BatchNorm1d(4)
            self.linear2 = torch.nn.Linear(4, 5)

        def forward(self, x):
            hidden = self.linear1(x)
            return self.linear2(self.batchnorm(hidden))

    # On HPU the last layer stays on device 0; otherwise it goes to device 1
    last_layer_device = 0 if is_hpu_available() else 1
    device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": last_layer_device}

    model = ModelForTest()
    dispatch_model(model, device_map=device_map)
    # Preparing a model dispatched with this device map is expected to raise
    with assert_exception(ValueError, "You can't train a model that has been loaded with"):
        model = accelerator.prepare_model(model)


================================================
FILE: tests/test_offload.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import unittest
from tempfile import TemporaryDirectory

import torch
import torch.nn as nn

from accelerate.utils import (
    OffloadedWeightsLoader,
    extract_submodules_state_dict,
    load_offloaded_weight,
    offload_state_dict,
    offload_weight,
)


class ModelForTest(nn.Module):
    """Small linear -> batchnorm -> linear network used as a fixture by the offload tests."""

    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(3, 4)
        self.batchnorm = nn.BatchNorm1d(4)
        self.linear2 = nn.Linear(4, 5)

    def forward(self, x):
        # Apply the three layers in sequence
        hidden = self.linear1(x)
        normalized = self.batchnorm(hidden)
        return self.linear2(normalized)


class OffloadTester(unittest.TestCase):
    """Tests for the disk-offloading helpers exposed by `accelerate.utils`."""

    def _assert_loader_covers(self, state_dict, in_memory, on_disk):
        # Offload `on_disk` to a temporary folder, build a loader that combines it with
        # `in_memory`, and check that the loader exposes every entry of `state_dict`
        with TemporaryDirectory() as tmp_dir:
            offload_state_dict(tmp_dir, on_disk)
            weight_map = OffloadedWeightsLoader(state_dict=in_memory, save_folder=tmp_dir)

            # Every key is there with the right value
            assert sorted(weight_map) == sorted(state_dict.keys())
            for key, param in state_dict.items():
                assert torch.allclose(param, weight_map[key])

    def test_offload_state_dict(self):
        model = ModelForTest()
        with TemporaryDirectory() as tmp_dir:
            offload_state_dict(tmp_dir, model.state_dict())
            assert os.path.isfile(os.path.join(tmp_dir, "index.json"))
            # TODO: add tests on what is inside the index

            for layer in ("linear1", "linear2"):
                for tensor_name in ("weight", "bias"):
                    assert os.path.isfile(os.path.join(tmp_dir, f"{layer}.{tensor_name}.dat"))
                    # TODO: add tests on the fact weights are properly loaded

    def test_offload_weight(self):
        for dtype in (torch.float16, torch.float32, torch.bfloat16):
            weight = torch.randn(2, 3, dtype=dtype)
            with TemporaryDirectory() as tmp_dir:
                index = offload_weight(weight, "weight", tmp_dir, {})
                weight_file = os.path.join(tmp_dir, "weight.dat")
                assert os.path.isfile(weight_file)

                # The index records the shape and the dtype name without its "torch." prefix
                dtype_name = str(dtype).split(".")[1]
                assert index == {"weight": {"shape": [2, 3], "dtype": dtype_name}}

                # Reading the file back must give the original values
                new_weight = load_offloaded_weight(weight_file, index["weight"])
                assert torch.equal(weight, new_weight)

    def test_offload_weights_loader(self):
        model = ModelForTest()
        state_dict = model.state_dict()

        def split(keep_in_memory):
            # Partition the state dict according to a predicate on the key
            in_memory = {k: v for k, v in state_dict.items() if keep_in_memory(k)}
            on_disk = {k: v for k, v in state_dict.items() if not keep_in_memory(k)}
            return in_memory, on_disk

        # Everything except linear2 in memory, linear2 on disk
        cpu_part, disk_part = split(lambda key: "linear2" not in key)
        self._assert_loader_covers(state_dict, cpu_part, disk_part)

        # Weights in memory, everything else on disk
        cpu_part, disk_part = split(lambda key: "weight" in key)
        self._assert_loader_covers(state_dict, cpu_part, disk_part)

        # Whole state dict on disk while the weights are also in memory:
        # duplicates are removed
        self._assert_loader_covers(state_dict, cpu_part, state_dict)

    def test_extract_submodules_state_dict(self):
        # "a.10" shares a prefix with "a.1" but must not be extracted with it
        cases = [
            ({"a.1": 0, "a.10": 1, "a.2": 2}, {"a.1": 0, "a.2": 2}),
            ({"a.1.a": 0, "a.10.a": 1, "a.2.a": 2}, {"a.1.a": 0, "a.2.a": 2}),
        ]
        for state_dict, expected in cases:
            extracted = extract_submodules_state_dict(state_dict, ["a.1", "a.2"])
            assert extracted == expected


================================================
FILE: tests/test_optimizer.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle

import torch

from accelerate import Accelerator
from accelerate.test_utils import require_cpu, require_fp16, require_non_cpu
from accelerate.test_utils.testing import AccelerateTestCase


@require_cpu
class CPUOptimizerTester(AccelerateTestCase):
    # CPU-only checks on the optimizer returned by `Accelerator.prepare`

    def test_accelerated_optimizer_pickling(self):
        model = torch.nn.Linear(10, 10)
        raw_optimizer = torch.optim.SGD(model.parameters(), 0.1)
        accelerator = Accelerator()
        optimizer = accelerator.prepare(raw_optimizer)
        try:
            # Round-trip the prepared optimizer through pickle
            serialized = pickle.dumps(optimizer)
            pickle.loads(serialized)
        except Exception as e:
            self.fail(f"Accelerated optimizer pickling failed with {e}")


@require_fp16
@require_non_cpu
class OptimizerTester(AccelerateTestCase):
    # fp16 checks on the `step_was_skipped` flag of the prepared optimizer

    def test_accelerated_optimizer_step_was_skipped(self):
        model = torch.nn.Linear(5, 5)
        optimizer = torch.optim.SGD(model.parameters(), 0.1)
        accelerator = Accelerator(mixed_precision="fp16")
        model, optimizer = accelerator.prepare(model, optimizer)

        def step_with_fake_grads(inject_nan):
            # Run one backward pass, overwrite every gradient with a finite value,
            # optionally poison the last parameter's gradient, then step
            loss = model(torch.randn(2, 5, device=accelerator.device)).sum()
            accelerator.backward(loss)
            last_param = None
            for last_param in model.parameters():
                # Fake the gradients, as if there's no overflow
                last_param.grad.fill_(0.01)
            if inject_nan:
                # Manually set the gradients to be NaN, as if there's an overflow
                last_param.grad[0] = torch.tensor(float("nan"))

            optimizer.step()
            return optimizer.step_was_skipped

        # Finite gradients, two overflowing steps in a row, then finite gradients again
        assert step_with_fake_grads(inject_nan=False) is False
        assert step_with_fake_grads(inject_nan=True) is True
        assert step_with_fake_grads(inject_nan=True) is True
        assert step_with_fake_grads(inject_nan=False) is False


================================================
FILE: tests/test_quantization.py
================================================
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import tempfile
import unittest

import torch
import torch.nn as nn

from accelerate import Accelerator, init_empty_weights
from accelerate.test_utils import (
    require_bnb,
    require_cuda_or_xpu,
    require_huggingface_suite,
    require_multi_device,
    require_non_torch_xla,
    slow,
)
from accelerate.test_utils.testing import AccelerateTestCase
from accelerate.utils.bnb import load_and_quantize_model
from accelerate.utils.dataclasses import BnbQuantizationConfig
from accelerate.utils.memory import clear_device_cache


class BitsAndBytesConfigIntegration(unittest.TestCase):
    # Validation checks on `BnbQuantizationConfig`

    def test_BnbQuantizationConfig(self):
        # Requesting 8-bit and 4-bit loading at the same time must be rejected
        self.assertRaises(ValueError, BnbQuantizationConfig, load_in_8bit=True, load_in_4bit=True)


@require_non_torch_xla
@slow
@require_cuda_or_xpu
@require_bnb
@require_huggingface_suite
class MixedInt8EmptyModelTest(AccelerateTestCase):
    """
    8-bit quantization tests in which the model is first instantiated with empty weights and then loaded from a
    checkpoint and quantized through `load_and_quantize_model`.
    """

    # Constants are kept at class level; model loading happens in `setUp`.

    # We need to test on relatively large models (aka >1b parameters otherwise the quantization may not work as expected)
    # Therefore here we use only bloom-1b7 to test our module
    model_name = "marcsun13/bloom-1b7_with_lm_head"

    # Constant values
    # This was obtained on a Quadro RTX 8000 so the number might slightly change
    EXPECTED_RELATIVE_DIFFERENCE = 1.540025

    input_text = "Hello my name is"
    EXPECTED_OUTPUT = "Hello my name is John.\nI am a friend of the family.\n"
    MAX_NEW_TOKENS = 10

    # Custom placement mixing "cpu" with device indices 0 and 1. Tests pass a copy of it so that the shared
    # class-level dict can never be modified by the loading code.
    CPU_GPU_DEVICE_MAP = {
        "transformer.word_embeddings": "cpu",
        "transformer.word_embeddings_layernorm": 0,
        "lm_head": "cpu",
        "transformer.h.0": "cpu",
        "transformer.h.1": "cpu",
        "transformer.h.2": "cpu",
        "transformer.h.3": 0,
        "transformer.h.4": 0,
        "transformer.h.5": 0,
        "transformer.h.6": 0,
        "transformer.h.7": 0,
        "transformer.h.8": 0,
        "transformer.h.9": 1,
        "transformer.h.10": 0,
        "transformer.h.11": 1,
        "transformer.h.12": 0,
        "transformer.h.13": 0,
        "transformer.h.14": 1,
        "transformer.h.15": 0,
        "transformer.h.16": 0,
        "transformer.h.17": 1,
        "transformer.h.18": 1,
        "transformer.h.19": 0,
        "transformer.h.20": 1,
        "transformer.h.21": 1,
        "transformer.h.22": 0,
        "transformer.h.23": 0,
        "transformer.ln_f": 1,
    }

    # Same placement with blocks 3 to 5 sent to "disk". Overriding existing keys keeps the original key order.
    CPU_GPU_DISK_DEVICE_MAP = {
        **CPU_GPU_DEVICE_MAP,
        "transformer.h.3": "disk",
        "transformer.h.4": "disk",
        "transformer.h.5": "disk",
    }

    def setUp(self):
        """
        Setup quantized model from empty model
        """
        from huggingface_hub import hf_hub_download
        from transformers import AutoModelForCausalLM, AutoTokenizer

        # Chain up so that any per-test setup done by `AccelerateTestCase` is not bypassed by this override.
        super().setUp()

        # Models and tokenizer
        self.model_fp16 = AutoModelForCausalLM.from_pretrained(
            self.model_name, torch_dtype=torch.float16, device_map="auto"
        )

        self.weights_location = hf_hub_download(self.model_name, "pytorch_model.bin")
        self.bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)
        self.model_8bit = self._quantize_empty_model(self.bnb_quantization_config, device_map={"": 0})

        self.tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b7")
        self.accelerate = Accelerator()

    def tearDown(self):
        r"""
        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        # Chain up so that the cleanup implemented by `AccelerateTestCase` runs; it used to be skipped because
        # this override never called it.
        super().tearDown()
        del self.model_fp16
        del self.model_8bit

        clear_device_cache(garbage_collection=True)

    def _build_empty_model(self):
        """
        Instantiate the test model from its config under `init_empty_weights` and tie its weights.
        """
        from transformers import AutoConfig, AutoModelForCausalLM

        # create model on meta device
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name))
        model.tie_weights()
        return model

    def _quantize_empty_model(self, bnb_quantization_config, device_map, weights_location=None, **load_kwargs):
        """
        Build a fresh empty model and run `load_and_quantize_model` on it.

        Args:
            bnb_quantization_config: the `BnbQuantizationConfig` to apply.
            device_map: forwarded as `device_map`.
            weights_location: checkpoint to load; defaults to `self.weights_location` set in `setUp`.
            **load_kwargs: extra keyword arguments forwarded unchanged (e.g. `offload_folder`).

        Returns:
            The object returned by `load_and_quantize_model`.
        """
        if weights_location is None:
            weights_location = self.weights_location
        return load_and_quantize_model(
            self._build_empty_model(),
            bnb_quantization_config,
            weights_location=weights_location,
            device_map=device_map,
            no_split_module_classes=["BloomBlock"],
            **load_kwargs,
        )

    def _check_serialization_roundtrip(self, **save_kwargs):
        """
        Save `self.model_8bit` with `Accelerator.save_model`, reload the saved weights into a fresh empty model
        with 8-bit quantization, then check the reloaded weights and the generated text.

        Args:
            **save_kwargs: extra keyword arguments forwarded to `save_model` (e.g. `max_shard_size`).
        """
        from bitsandbytes.nn import Int8Params

        with tempfile.TemporaryDirectory() as tmpdirname:
            # saving state dict for now but will save config and other in the future
            self.accelerate.save_model(self.model_8bit, tmpdirname, **save_kwargs)

            # let's suppose that we can get the right config
            model_8bit_from_saved = self._quantize_empty_model(
                BnbQuantizationConfig(load_in_8bit=True),
                device_map="auto",
                weights_location=tmpdirname,
            )

            weight = model_8bit_from_saved.transformer.h[0].mlp.dense_4h_to_h.weight
            assert isinstance(weight, Int8Params)
            assert hasattr(weight, "SCB")
            assert hasattr(weight, "CB")

            self.check_inference_correctness(model_8bit_from_saved)

    def test_memory_footprint(self):
        r"""
        A simple test to check if the model conversion has been done correctly by checking on the
        memory footprint of the converted model and the class type of the linear layers of the converted models
        """
        from bitsandbytes.nn import Int8Params

        mem_fp16 = self.model_fp16.get_memory_footprint()
        mem_8bit = self.model_8bit.get_memory_footprint()

        assert round((mem_fp16 / mem_8bit) - self.EXPECTED_RELATIVE_DIFFERENCE, 7) >= 0
        assert isinstance(self.model_8bit.transformer.h[0].mlp.dense_4h_to_h.weight, Int8Params)

    def test_linear_are_8bit(self):
        r"""
        Check that every `torch.nn.Linear` module of the quantized model holds `torch.int8` weights, except the
        modules listed in `keep_in_fp32_modules` or `skip_modules` of the quantization config.
        """
        # The exclusion list does not change during the loop, so it is built once.
        modules_not_converted = (
            self.bnb_quantization_config.keep_in_fp32_modules + self.bnb_quantization_config.skip_modules
        )

        for name, module in self.model_8bit.named_modules():
            if isinstance(module, torch.nn.Linear) and name not in modules_not_converted:
                assert module.weight.dtype == torch.int8

    def test_llm_skip(self):
        r"""
        A simple test to check if `skip_modules` works as expected
        """
        import bitsandbytes as bnb

        bnb_quantization_config = BnbQuantizationConfig(
            load_in_8bit=True, skip_modules=["lm_head", "transformer.word_embeddings"]
        )
        model = self._quantize_empty_model(bnb_quantization_config, device_map="auto")

        assert model.transformer.h[1].mlp.dense_4h_to_h.weight.dtype == torch.int8
        assert isinstance(model.transformer.h[1].mlp.dense_4h_to_h, bnb.nn.Linear8bitLt)
        assert isinstance(model.lm_head, nn.Linear)
        assert model.lm_head.weight.dtype != torch.int8

    def check_inference_correctness(self, model):
        r"""
        Test the generation quality of the quantized model and see that we are matching the expected output.
        Given that we are operating on small numbers + the testing model is relatively small, we might not get
        the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
        """
        # Check that inference pass works on the model
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

        # Check the exactness of the results
        output_parallel = model.generate(
            input_ids=encoded_input["input_ids"].to(0), max_new_tokens=self.MAX_NEW_TOKENS
        )

        # Get the generation
        output_text = self.tokenizer.decode(output_parallel[0], skip_special_tokens=True)
        assert output_text == self.EXPECTED_OUTPUT

    def test_generate_quality(self):
        r"""
        Check the text generated by the model quantized in `setUp`.
        """
        self.check_inference_correctness(self.model_8bit)

    def test_fp32_8bit_conversion(self):
        r"""
        Test whether it is possible to mix both `8bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
        """
        bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, keep_in_fp32_modules=["lm_head"])
        model = self._quantize_empty_model(bnb_quantization_config, device_map="auto")
        assert model.lm_head.weight.dtype == torch.float32

    @require_multi_device
    def test_cpu_gpu_loading_custom_device_map(self):
        r"""
        A test to check that dispatching a model on cpu & gpu works correctly using a custom `device_map`.
        """
        from bitsandbytes.nn import Int8Params

        model_8bit = self._quantize_empty_model(
            BnbQuantizationConfig(load_in_8bit=True),
            device_map=dict(self.CPU_GPU_DEVICE_MAP),
        )
        assert isinstance(model_8bit.transformer.h[0].mlp.dense_4h_to_h.weight, Int8Params)
        assert isinstance(model_8bit.transformer.h[1].mlp.dense_4h_to_h.weight, Int8Params)
        self.check_inference_correctness(model_8bit)

    @require_multi_device
    def test_cpu_gpu_loading_custom_device_map_offload_state_dict(self):
        r"""
        A test to check that dispatching a model on cpu & gpu works correctly using a custom `device_map` and
        offload_state_dict=True.
        """
        from bitsandbytes.nn import Int8Params

        model_8bit = self._quantize_empty_model(
            BnbQuantizationConfig(load_in_8bit=True),
            device_map=dict(self.CPU_GPU_DEVICE_MAP),
            offload_state_dict=True,
        )
        assert isinstance(model_8bit.transformer.h[0].mlp.dense_4h_to_h.weight, Int8Params)
        assert isinstance(model_8bit.transformer.h[1].mlp.dense_4h_to_h.weight, Int8Params)
        self.check_inference_correctness(model_8bit)

    @require_multi_device
    def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self):
        r"""
        A test to check that dispatching a model on cpu & gpu works correctly using a custom `device_map`.
        This time we also add `disk` on the device_map - using the kwargs directly instead of the quantization config
        """
        from bitsandbytes.nn import Int8Params

        with tempfile.TemporaryDirectory() as tmpdirname:
            model_8bit = self._quantize_empty_model(
                BnbQuantizationConfig(load_in_8bit=True),
                device_map=dict(self.CPU_GPU_DISK_DEVICE_MAP),
                offload_folder=tmpdirname,
                offload_state_dict=True,
            )
            assert isinstance(model_8bit.transformer.h[4].mlp.dense_4h_to_h.weight, Int8Params)
            assert isinstance(model_8bit.transformer.h[5].mlp.dense_4h_to_h.weight, Int8Params)
            self.check_inference_correctness(model_8bit)

    def test_int8_serialization(self):
        r"""
        Test whether it is possible to serialize a model in 8-bit.
        """
        self._check_serialization_roundtrip()

    @require_multi_device
    def test_int8_serialization_offload(self):
        r"""
        Test whether it is possible to serialize a model in 8-bit and offload weights to cpu/disk
        """
        from bitsandbytes.nn import Int8Params

        with tempfile.TemporaryDirectory() as tmpdirname:
            # saving state dict for now but will save config and other in the future
            self.accelerate.save_model(self.model_8bit, tmpdirname)

            # let's suppose that we can get the right config
            model_8bit_from_saved = self._quantize_empty_model(
                BnbQuantizationConfig(load_in_8bit=True),
                device_map=dict(self.CPU_GPU_DISK_DEVICE_MAP),
                weights_location=tmpdirname,
                offload_folder=tmpdirname + "/tmp",
                offload_state_dict=True,
            )

            assert isinstance(model_8bit_from_saved.transformer.h[4].mlp.dense_4h_to_h.weight, Int8Params)
            assert isinstance(model_8bit_from_saved.transformer.h[5].mlp.dense_4h_to_h.weight, Int8Params)
            self.check_inference_correctness(model_8bit_from_saved)

    def test_int8_serialization_shard(self):
        r"""
        Test whether it is possible to serialize a model in 8-bit when saving with `max_shard_size="1GB"`.
        """
        self._check_serialization_roundtrip(max_shard_size="1GB")


@require_non_torch_xla
@slow
@require_cuda_or_xpu
@require_bnb
@require_huggingface_suite
class MixedInt8LoaddedModelTest(unittest.TestCase):
    """
    8-bit quantization tests in which an already loaded fp16 model is handed to `load_and_quantize_model`.
    """

    # Constants are kept at class level; model loading happens in `setUp`.

    # We need to test on relatively large models (aka >1b parameters otherwise the quantization may not work as expected)
    # Therefore here we use only bloom-1b7 to test our module
    model_name = "marcsun13/bloom-1b7_with_lm_head"

    # Constant values
    # This was obtained on a Quadro RTX 8000 so the number might slightly change
    EXPECTED_RELATIVE_DIFFERENCE = 1.540025

    input_text = "Hello my name is"
    EXPECTED_OUTPUT = "Hello my name is John.\nI am a friend of the family.\n"
    MAX_NEW_TOKENS = 10

    def setUp(self):
        """
        Setup quantized model from loaded model
        """
        from transformers import AutoModelForCausalLM, AutoTokenizer

        super().setUp()

        # Models and tokenizer
        self.model_fp16 = AutoModelForCausalLM.from_pretrained(
            self.model_name, torch_dtype=torch.float16, device_map="auto"
        )

        self.bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)

        self.model_8bit = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype=torch.float16)
        self.model_8bit = load_and_quantize_model(self.model_8bit, self.bnb_quantization_config)

        self.tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b7")

    def tearDown(self):
        r"""
        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        super().tearDown()
        del self.model_fp16
        del self.model_8bit

        clear_device_cache(garbage_collection=True)

    def test_memory_footprint(self):
        r"""
        A simple test to check if the model conversion has been done correctly by checking on the
        memory footprint of the converted model and the class type of the linear layers of the converted models
        """
        from bitsandbytes.nn import Int8Params

        mem_fp16 = self.model_fp16.get_memory_footprint()
        mem_8bit = self.model_8bit.get_memory_footprint()

        assert round((mem_fp16 / mem_8bit) - self.EXPECTED_RELATIVE_DIFFERENCE, 7) >= 0
        assert isinstance(self.model_8bit.transformer.h[0].mlp.dense_4h_to_h.weight, Int8Params)

    def test_linear_are_8bit(self):
        r"""
        Check that every `torch.nn.Linear` module of the quantized model holds `torch.int8` weights, except the
        modules listed in `keep_in_fp32_modules` or `skip_modules` of the quantization config.
        """
        # The exclusion list does not change during the loop, so it is built once.
        modules_not_converted = (
            self.bnb_quantization_config.keep_in_fp32_modules + self.bnb_quantization_config.skip_modules
        )

        for name, module in self.model_8bit.named_modules():
            if isinstance(module, torch.nn.Linear) and name not in modules_not_converted:
                assert module.weight.dtype == torch.int8

    def test_generate_quality(self):
        r"""
        Test the generation quality of the quantized model and see that we are matching the expected output.
        Given that we are operating on small numbers + the testing model is relatively small, we might not get
        the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
        """
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

        output_sequences = self.model_8bit.generate(
            input_ids=encoded_input["input_ids"].to(self.model_8bit.device),
            max_new_tokens=self.MAX_NEW_TOKENS,
        )

        assert self.tokenizer.decode(output_sequences[0], skip_special_tokens=True) == self.EXPECTED_OUTPUT

    def test_fp32_8bit_conversion(self):
        r"""
        Test whether it is possible to mix both `8bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
        """
        from transformers import AutoModelForCausalLM

        bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, keep_in_fp32_modules=["lm_head"])

        model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype=torch.float16)
        model = load_and_quantize_model(model, bnb_quantization_config)
        assert model.lm_head.weight.dtype == torch.float32


@require_non_torch_xla
@slow
@require_cuda_or_xpu
@require_bnb
@require_huggingface_suite
class Bnb4BitEmptyModelTest(unittest.TestCase):
    """
    4-bit quantization tests in which the model is first instantiated with empty weights and then loaded from a
    checkpoint and quantized through `load_and_quantize_model`.
    """

    # Constants are kept at class level; model loading happens in `setUp`.

    # We need to test on relatively large models (aka >1b parameters otherwise the quantization may not work as expected)
    # Therefore here we use only bloom-1b7 to test our module
    model_name = "marcsun13/bloom-1b7_with_lm_head"

    # Constant values
    # This was obtained on a RTX Titan so the number might slightly change
    EXPECTED_RELATIVE_DIFFERENCE = 2.109659552692574

    input_text = "Hello my name is"
    # Any of these generations is accepted by `check_inference_correctness`.
    EXPECTED_OUTPUTS = {
        "Hello my name is John and I am a professional photographer. I",
        "Hello my name is John.\nI am a friend of your father.\n",
    }
    MAX_NEW_TOKENS = 10

    def setUp(self):
        """
        Setup quantized model from empty model
        """
        from huggingface_hub import hf_hub_download
        from transformers import AutoModelForCausalLM, AutoTokenizer

        super().setUp()

        # Models and tokenizer
        self.model_fp16 = AutoModelForCausalLM.from_pretrained(
            self.model_name, torch_dtype=torch.float16, device_map="auto"
        )

        self.weights_location = hf_hub_download(self.model_name, "pytorch_model.bin")
        self.bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True)
        self.model_4bit = self._quantize_empty_model(self.bnb_quantization_config, device_map={"": 0})

        self.tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b7")

    def tearDown(self):
        """
        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        super().tearDown()
        del self.model_fp16
        del self.model_4bit

        clear_device_cache(garbage_collection=True)

    def _build_empty_model(self):
        """
        Instantiate the test model from its config under `init_empty_weights` and tie its weights.
        """
        from transformers import AutoConfig, AutoModelForCausalLM

        # create model on meta device
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name))
        model.tie_weights()
        return model

    def _quantize_empty_model(self, bnb_quantization_config, device_map, **load_kwargs):
        """
        Build a fresh empty model and run `load_and_quantize_model` on it with the checkpoint downloaded in `setUp`.

        Args:
            bnb_quantization_config: the `BnbQuantizationConfig` to apply.
            device_map: forwarded as `device_map`.
            **load_kwargs: extra keyword arguments forwarded unchanged (e.g. `offload_folder`).

        Returns:
            The object returned by `load_and_quantize_model`.
        """
        return load_and_quantize_model(
            self._build_empty_model(),
            bnb_quantization_config,
            weights_location=self.weights_location,
            device_map=device_map,
            no_split_module_classes=["BloomBlock"],
            **load_kwargs,
        )

    def test_memory_footprint(self):
        r"""
        A simple test to check if the model conversion has been done correctly by checking on the
        memory footprint of the converted model and the class type of the linear layers of the converted models
        """
        from bitsandbytes.nn import Params4bit

        mem_fp16 = self.model_fp16.get_memory_footprint()
        mem_4bit = self.model_4bit.get_memory_footprint()

        assert round((mem_fp16 / mem_4bit) - self.EXPECTED_RELATIVE_DIFFERENCE, 7) >= 0
        assert isinstance(self.model_4bit.transformer.h[0].mlp.dense_4h_to_h.weight, Params4bit)

    def check_inference_correctness(self, model):
        r"""
        Test the generation quality of the quantized model and see that we are matching the expected output.
        Given that we are operating on small numbers + the testing model is relatively small, we might not get
        the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
        """
        # Check that inference pass works on the model
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

        # Check the exactness of the results
        output_sequences = model.generate(
            input_ids=encoded_input["input_ids"].to(0), max_new_tokens=self.MAX_NEW_TOKENS
        )

        assert self.tokenizer.decode(output_sequences[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS

    def test_generate_quality(self):
        r"""
        Check the text generated by the model quantized in `setUp`.
        """
        self.check_inference_correctness(self.model_4bit)

    def test_linear_are_4bit(self):
        r"""
        Check that every `torch.nn.Linear` module of the quantized model holds `torch.uint8` weights, except the
        modules listed in `keep_in_fp32_modules` or `skip_modules` of the quantization config.
        """
        # The exclusion list does not change during the loop, so it is built once.
        modules_not_converted = (
            self.bnb_quantization_config.keep_in_fp32_modules + self.bnb_quantization_config.skip_modules
        )

        for name, module in self.model_4bit.named_modules():
            if isinstance(module, torch.nn.Linear) and name not in modules_not_converted:
                # 4-bit parameters are packed in uint8 variables
                assert module.weight.dtype == torch.uint8

    def test_fp32_4bit_conversion(self):
        r"""
        Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
        """
        bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True, keep_in_fp32_modules=["lm_head"])
        model = self._quantize_empty_model(bnb_quantization_config, device_map="auto")
        assert model.lm_head.weight.dtype == torch.float32

    @require_multi_device
    def test_cpu_gpu_loading_random_device_map(self):
        r"""
        A test to check that dispatching a model on cpu & gpu works correctly using a random `device_map`.
        """
        device_map = {
            "transformer.word_embeddings": "cpu",
            "transformer.word_embeddings_layernorm": 0,
            "lm_head": "cpu",
            "transformer.h.0": 0,
            "transformer.h.1": 0,
            "transformer.h.2": 0,
            "transformer.h.3": 0,
            "transformer.h.4": 0,
            "transformer.h.5": 0,
            "transformer.h.6": 0,
            "transformer.h.7": 0,
            "transformer.h.8": 0,
            "transformer.h.9": 1,
            "transformer.h.10": 0,
            "transformer.h.11": 1,
            "transformer.h.12": 0,
            "transformer.h.13": 0,
            "transformer.h.14": 1,
            "transformer.h.15": 0,
            "transformer.h.16": 0,
            "transformer.h.17": 1,
            "transformer.h.18": 1,
            "transformer.h.19": 0,
            "transformer.h.20": 1,
            "transformer.h.21": 1,
            "transformer.h.22": 0,
            "transformer.h.23": 0,
            "transformer.ln_f": 1,
        }

        model_4bit = self._quantize_empty_model(BnbQuantizationConfig(load_in_4bit=True), device_map=device_map)
        self.check_inference_correctness(model_4bit)

    @require_multi_device
    def test_cpu_gpu_loading_custom_device_map(self):
        r"""
        A test to check that dispatching a model on cpu & gpu works correctly using a custom `device_map`.
        """
        device_map = {
            "transformer.word_embeddings": "cpu",
            "transformer.word_embeddings_layernorm": "cpu",
            "lm_head": "cpu",
            "transformer.h": 0,
            "transformer.ln_f": 1,
        }

        model_4bit = self._quantize_empty_model(BnbQuantizationConfig(load_in_4bit=True), device_map=device_map)
        self.check_inference_correctness(model_4bit)

    @require_multi_device
    def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self):
        r"""
        A test to check that dispatching a model on cpu & gpu works correctly using a custom `device_map`.
        This time we also add `disk` on the device_map - using the kwargs directly instead of the quantization config
        """
        device_map = {
            "transformer.word_embeddings": 0,
            "transformer.word_embeddings_layernorm": "disk",
            "lm_head": 0,
            "transformer.h": 1,
            "transformer.ln_f": "cpu",
        }

        with tempfile.TemporaryDirectory() as tmpdirname:
            model_4bit = self._quantize_empty_model(
                BnbQuantizationConfig(load_in_4bit=True),
                device_map=device_map,
                offload_folder=tmpdirname,
                offload_state_dict=True,
            )
            self.check_inference_correctness(model_4bit)


@require_non_torch_xla
@slow
@require_cuda_or_xpu
@require_bnb
@require_huggingface_suite
class Bnb4BitTestLoadedModel(unittest.TestCase):
    """
    Tests 4-bit quantization applied through `load_and_quantize_model` to a model that was first loaded with
    `from_pretrained`.
    """

    # Constants are kept as class attributes; the models themselves are loaded in `setUp`.

    # We need to test on relatively large models (aka >1b parameters, otherwise the quantization may not work as
    # expected). Therefore here we use only bloom-1b7 to test our module
    model_name = "marcsun13/bloom-1b7_with_lm_head"

    # Constant values
    # This was obtained on a RTX Titan so the number might slightly change
    EXPECTED_RELATIVE_DIFFERENCE = 2.109659552692574

    input_text = "Hello my name is"
    # Any of these generations is accepted by `test_generate_quality`.
    EXPECTED_OUTPUTS = {
        "Hello my name is John and I am a professional photographer. I",
        "Hello my name is John.\nI am a friend of your father.\n",
    }
    MAX_NEW_TOKENS = 10

    def setUp(self):
        """
        Setup quantized model from loaded model
        """
        from transformers import AutoModelForCausalLM, AutoTokenizer

        super().setUp()

        # fp16 reference model, used as the baseline for the memory footprint comparison
        self.model_fp16 = AutoModelForCausalLM.from_pretrained(
            self.model_name, torch_dtype=torch.float16, device_map="auto"
        )

        self.bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True)

        # Second copy of the same checkpoint, quantized after loading
        self.model_4bit = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype=torch.float16)
        self.model_4bit = load_and_quantize_model(self.model_4bit, self.bnb_quantization_config)

        self.tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b7")

    def tearDown(self):
        """
        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        super().tearDown()
        del self.model_fp16
        del self.model_4bit

        clear_device_cache(garbage_collection=True)

    def test_memory_footprint(self):
        r"""
        A simple test to check if the model conversion has been done correctly by checking on the
        memory footprint of the converted model and the class type of the linear layers of the converted models
        """
        from bitsandbytes.nn import Params4bit

        mem_fp16 = self.model_fp16.get_memory_footprint()
        mem_4bit = self.model_4bit.get_memory_footprint()

        # The fp16 / 4-bit footprint ratio must be at least the reference value (compared after rounding the
        # difference to 7 decimals).
        assert round((mem_fp16 / mem_4bit) - self.EXPECTED_RELATIVE_DIFFERENCE, 7) >= 0
        assert self.model_4bit.transformer.h[0].mlp.dense_4h_to_h.weight.__class__ == Params4bit

    def test_linear_are_4bit(self):
        r"""
        A simple test to check if the model conversion has been done correctly by checking that the weights of
        every linear layer that is not explicitly excluded from quantization are stored as `torch.uint8`.
        """
        # Modules listed in the quantization config as kept in fp32 or skipped are not expected to be converted.
        excluded_modules = (
            self.bnb_quantization_config.keep_in_fp32_modules + self.bnb_quantization_config.skip_modules
        )

        for name, module in self.model_4bit.named_modules():
            if isinstance(module, torch.nn.Linear) and name not in excluded_modules:
                # 4-bit parameters are packed in uint8 variables
                assert module.weight.dtype == torch.uint8

    def test_generate_quality(self):
        r"""
        Test the generation quality of the quantized model and see that we are matching the expected output.
        Given that we are operating on small numbers + the testing model is relatively small, we might not get
        the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
        """
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

        output_sequences = self.model_4bit.generate(
            input_ids=encoded_input["input_ids"].to(self.model_4bit.device),
            max_new_tokens=self.MAX_NEW_TOKENS,
        )

        assert self.tokenizer.decode(output_sequences[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS

    def test_fp32_4bit_conversion(self):
        r"""
        Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
        """
        from transformers import AutoModelForCausalLM

        bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True, keep_in_fp32_modules=["lm_head"])

        model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype=torch.float16)
        model = load_and_quantize_model(model, bnb_quantization_config)
        assert model.lm_head.weight.dtype == torch.float32


================================================
FILE: tests/test_sagemaker.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from dataclasses import dataclass

import pytest

from accelerate.commands.config.config_args import SageMakerConfig
from accelerate.utils import ComputeEnvironment
from accelerate.utils.launch import _convert_nargs_to_dict


@dataclass
class MockLaunchConfig(SageMakerConfig):
    """
    Sample SageMaker launch settings used as a fixture by the tests of this module.

    None of the assignments below carries a type annotation, so they do not declare new dataclass fields; they are
    plain class attributes and the tests read them directly from the class.
    """

    compute_environment = ComputeEnvironment.AMAZON_SAGEMAKER
    fp16 = True
    ec2_instance_type = "ml.p3.2xlarge"
    iam_role_name = "accelerate_sagemaker_execution_role"
    profile = "hf-sm"
    region = "us-east-1"
    num_machines = 1
    base_job_name = "accelerate-sagemaker-1"
    pytorch_version = "1.6"
    transformers_version = "4.4"
    training_script = "train.py"
    # Every flag is followed by a value. `test_args_convert` expects `_convert_nargs_to_dict` to turn these values
    # into str, bool, int, float and float respectively.
    success_training_script_args = [
        "--model_name_or_path",
        "bert",
        "--do_train",
        "False",
        "--epochs",
        "3",
        "--learning_rate",
        "5e-5",
        "--max_steps",
        "50.5",
    ]
    # `test_args_convert` expects `_convert_nargs_to_dict` to raise `ValueError` for this list.
    # NOTE(review): presumably because `--do_train` and `--do_predict` are not followed by a value - confirm against
    # `_convert_nargs_to_dict`.
    fail_training_script_args = [
        "--model_name_or_path",
        "bert",
        "--do_train",
        "--do_test",
        "False",
        "--do_predict",
        "--epochs",
        "3",
        "--learning_rate",
        "5e-5",
        "--max_steps",
        "50.5",
    ]


class SageMakerLaunch(unittest.TestCase):
    """Checks how `_convert_nargs_to_dict` handles the sample script arguments of `MockLaunchConfig`."""

    def test_args_convert(self):
        # Each value of the well-formed argument list must come back with the expected Python type.
        parsed = _convert_nargs_to_dict(MockLaunchConfig.success_training_script_args)
        expected_types = (
            ("model_name_or_path", str),
            ("do_train", bool),
            ("epochs", int),
            ("learning_rate", float),
            ("max_steps", float),
        )
        for key, expected_type in expected_types:
            assert isinstance(parsed[key], expected_type)

        # The `fail_training_script_args` list must be rejected.
        with pytest.raises(ValueError):
            _convert_nargs_to_dict(MockLaunchConfig.fail_training_script_args)


================================================
FILE: tests/test_samples/MRPC/dev.csv
================================================
label,sentence1,sentence2
equivalent,He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .,""" The foodservice pie business does not fit our long-term growth strategy ."
not_equivalent,Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .,"His wife said he was "" 100 percent behind George Bush "" and looked forward to using his years of training in the war ."
not_equivalent,"The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .","The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent ."
equivalent,The AFL-CIO is waiting until October to decide if it will endorse a candidate .,The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
not_equivalent,No dates have been set for the civil or the criminal trial .,"No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty ."
equivalent,Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .,It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .


================================================
FILE: tests/test_samples/MRPC/train.csv
================================================
label,sentence1,sentence2
equivalent,He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .,""" The foodservice pie business does not fit our long-term growth strategy ."
not_equivalent,Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .,"His wife said he was "" 100 percent behind George Bush "" and looked forward to using his years of training in the war ."
not_equivalent,"The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .","The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent ."
equivalent,The AFL-CIO is waiting until October to decide if it will endorse a candidate .,The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
not_equivalent,No dates have been set for the civil or the criminal trial .,"No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty ."
equivalent,Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .,It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .


================================================
FILE: tests/test_samples/test_command_file.sh
================================================
echo "hello world"
echo "this is a second command"

================================================
FILE: tests/test_scheduler.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
from functools import partial

import torch

from accelerate import Accelerator, debug_launcher
from accelerate.state import AcceleratorState, GradientState
from accelerate.test_utils import require_cpu, require_huggingface_suite
from accelerate.utils import GradientAccumulationPlugin


def one_cycle_test(num_processes=2, step_scheduler_with_optimizer=True, split_batches=False):
    """
    Prepare a linear model, an AdamW optimizer and a `OneCycleLR` scheduler, step the scheduler once and check the
    `last_epoch` counter of the wrapped scheduler.

    The counter must equal `num_processes` when the scheduler steps with the optimizer or when there is a single
    process, and must differ from it otherwise.
    """
    accelerator = Accelerator(
        split_batches=split_batches, step_scheduler_with_optimizer=step_scheduler_with_optimizer
    )
    linear = torch.nn.Linear(2, 4)
    adamw = torch.optim.AdamW(linear.parameters(), lr=1.0)
    one_cycle = torch.optim.lr_scheduler.OneCycleLR(adamw, max_lr=0.01, steps_per_epoch=2, epochs=1)
    linear, adamw, one_cycle = accelerator.prepare(linear, adamw, one_cycle)

    # Optimizer has stepped
    one_cycle.step()
    last_epoch = one_cycle.scheduler.last_epoch
    if not step_scheduler_with_optimizer and num_processes != 1:
        assert last_epoch != num_processes, f"Last Epoch ({last_epoch}) == Num Processes ({num_processes})"
    else:
        assert last_epoch == num_processes, f"Last Epoch ({last_epoch}) != Num Processes ({num_processes})"


def lambda_test(num_processes=2, step_scheduler_with_optimizer=True, split_batches=False):
    """
    Prepare a linear model, an AdamW optimizer and a `LambdaLR` scheduler (`lr = 1 - n / 10`), then check the
    learning rate after one scheduler step with `_is_overflow` unset and after a second one with `_is_overflow` set.
    """

    def linear_decay(n):
        return 1 - n / 10

    accelerator = Accelerator(
        split_batches=split_batches, step_scheduler_with_optimizer=step_scheduler_with_optimizer
    )
    linear = torch.nn.Linear(2, 4)
    adamw = torch.optim.AdamW(linear.parameters(), lr=1.0)
    lambda_lr = torch.optim.lr_scheduler.LambdaLR(adamw, lr_lambda=linear_decay)
    linear, adamw, lambda_lr = accelerator.prepare(linear, adamw, lambda_lr)

    # Optimizer has stepped
    adamw._is_overflow = False
    lambda_lr.step()
    if step_scheduler_with_optimizer and not split_batches:
        expected_steps = num_processes
    else:
        expected_steps = 1
    expected_lr = 1 - expected_steps / 10
    observed_lr = lambda_lr.get_last_lr()[0]
    assert observed_lr == expected_lr, f"Wrong lr found at first step, expected {expected_lr}, got {observed_lr}"

    # Optimizer has not stepped
    adamw._is_overflow = True
    lambda_lr.step()
    if not step_scheduler_with_optimizer:
        # The scheduler is stepped independently of the optimizer, so a second step is expected to count
        expected_lr = 1 - 2 / 10
    observed_lr = lambda_lr.get_last_lr()[0]
    assert observed_lr == expected_lr, f"Wrong lr found at second step, expected {expected_lr}, got {observed_lr}"


def accumulation_test(num_processes: int = 2):
    """
    Check a linear schedule (no warmup, 20 training steps) under gradient accumulation.

    For 1, 2 and 4 accumulation steps, `10 * num_steps` iterations are run inside `accelerator.accumulate`. The
    learning rate must still be non-zero after the second-to-last iteration and exactly 0 after the last one.
    `adjust_scheduler` is only enabled when more than one accumulation step is used, and `GradientState` is reset
    after each configuration. `num_processes` is not read by the body.
    """
    from transformers import get_linear_schedule_with_warmup

    for num_steps in (1, 2, 4):
        total_iterations = 10 * num_steps
        accelerator = Accelerator(
            gradient_accumulation_plugin=GradientAccumulationPlugin(
                num_steps=num_steps, adjust_scheduler=num_steps > 1
            )
        )
        net = torch.nn.Linear(2, 4)
        adamw = torch.optim.AdamW(net.parameters(), lr=10.0)
        linear_schedule = get_linear_schedule_with_warmup(
            optimizer=adamw, num_warmup_steps=0, num_training_steps=20
        )

        net, adamw, linear_schedule = accelerator.prepare(net, adamw, linear_schedule)

        for iteration in range(total_iterations):
            with accelerator.accumulate(net):
                adamw.step()
                linear_schedule.step()

            if iteration != total_iterations - 2:
                continue
            assert linear_schedule.get_last_lr()[0] != 0, (
                f"Wrong lr found at second-to-last step, expected non-zero, got {linear_schedule.get_last_lr()[0]}. num_steps: {num_steps}"
            )
        assert linear_schedule.get_last_lr()[0] == 0, (
            f"Wrong lr found at last step, expected 0, got {linear_schedule.get_last_lr()[0]}"
        )
        GradientState._reset_state()


@require_cpu
class SchedulerTester(unittest.TestCase):
    """
    Runs the module-level scheduler checks through `debug_launcher`.

    The `*_single_process` tests launch one process; the `*_multiprocess` tests reset `AcceleratorState` first and
    rely on the default process count of `debug_launcher`.
    """

    def test_lambda_scheduler_steps_with_optimizer_single_process(self):
        # Once with the default batch handling, once with `split_batches=True`
        for extra_kwargs in ({}, {"split_batches": True}):
            debug_launcher(partial(lambda_test, num_processes=1, **extra_kwargs), num_processes=1)

    def test_one_cycle_scheduler_steps_with_optimizer_single_process(self):
        # Once with the default batch handling, once with `split_batches=True`
        for extra_kwargs in ({}, {"split_batches": True}):
            debug_launcher(partial(one_cycle_test, num_processes=1, **extra_kwargs), num_processes=1)

    def test_lambda_scheduler_not_step_with_optimizer_single_process(self):
        independent_fn = partial(lambda_test, num_processes=1, step_scheduler_with_optimizer=False)
        debug_launcher(independent_fn, num_processes=1)

    def test_one_cycle_scheduler_not_step_with_optimizer_single_process(self):
        independent_fn = partial(one_cycle_test, num_processes=1, step_scheduler_with_optimizer=False)
        debug_launcher(independent_fn, num_processes=1)

    def test_lambda_scheduler_steps_with_optimizer_multiprocess(self):
        AcceleratorState._reset_state(True)
        debug_launcher(lambda_test)
        # The `split_batches=True` variant is launched with a single process
        split_batches_fn = partial(lambda_test, num_processes=1, split_batches=True)
        debug_launcher(split_batches_fn, num_processes=1)

    def test_one_cycle_scheduler_steps_with_optimizer_multiprocess(self):
        AcceleratorState._reset_state(True)
        debug_launcher(one_cycle_test)
        # The `split_batches=True` variant is launched with a single process
        split_batches_fn = partial(one_cycle_test, num_processes=1, split_batches=True)
        debug_launcher(split_batches_fn, num_processes=1)

    def test_lambda_scheduler_not_step_with_optimizer_multiprocess(self):
        AcceleratorState._reset_state(True)
        independent_fn = partial(lambda_test, step_scheduler_with_optimizer=False)
        debug_launcher(independent_fn)

    def test_one_cycle_scheduler_not_step_with_optimizer_multiprocess(self):
        AcceleratorState._reset_state(True)
        independent_fn = partial(one_cycle_test, step_scheduler_with_optimizer=False)
        debug_launcher(independent_fn)

    @require_huggingface_suite
    def test_accumulation(self):
        AcceleratorState._reset_state(True)
        # First with `num_processes=1` bound, then with the function's own defaults
        debug_launcher(partial(accumulation_test, num_processes=1))
        debug_launcher(accumulation_test)


================================================
FILE: tests/test_state_checkpointing.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import logging
import os
import random
import shutil
import tempfile
import uuid
from contextlib import contextmanager

import pytest
import torch
from parameterized import parameterized_class
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

from accelerate import Accelerator
from accelerate.test_utils import (
    DEFAULT_LAUNCH_COMMAND,
    execute_subprocess_async,
    require_non_cpu,
    require_non_torch_xla,
    run_first,
)
from accelerate.test_utils.testing import AccelerateTestCase
from accelerate.utils import DistributedType, ProjectConfiguration, patch_environment, set_seed


logger = logging.getLogger(__name__)


def dummy_dataloaders(a=2, b=3, batch_size=16, n_train_batches: int = 10, n_valid_batches: int = 2):
    """
    Build a `(train, valid)` pair of DataLoaders over noisy samples of the line `y = a * x + b`.

    Each dataset holds `batch_size * n_batches` samples of shape `(1,)`. Only the training loader shuffles; both
    use `batch_size` and 4 workers.
    """

    def make_split(num_batches):
        n_samples = batch_size * num_batches
        # The inputs are drawn before the noise; keep this order so seeded runs stay reproducible.
        inputs = torch.randn(n_samples, 1)
        noise = torch.randn(n_samples, 1)
        targets = a * inputs + b + 0.1 * noise
        return TensorDataset(inputs, targets)

    train_split = make_split(n_train_batches)
    valid_split = make_split(n_valid_batches)
    loader_kwargs = {"batch_size": batch_size, "num_workers": 4}
    return (
        DataLoader(train_split, shuffle=True, **loader_kwargs),
        DataLoader(valid_split, shuffle=False, **loader_kwargs),
    )


def train(num_epochs, model, dataloader, optimizer, accelerator, scheduler=None):
    """
    Run `num_epochs` epochs of MSE training and return one `random.random()` draw per epoch.

    Every batch is an `(x, y)` pair; the loss is back-propagated through `accelerator.backward`, followed by an
    optimizer step and a gradient reset. When a `scheduler` is given it is stepped once per epoch, after the draw.
    """
    draws = []
    mse_loss = torch.nn.functional.mse_loss
    for _ in range(num_epochs):
        # Train quickly
        model.train()
        for x, y in dataloader:
            accelerator.backward(mse_loss(model(x), y))
            optimizer.step()
            optimizer.zero_grad()
        # Consume Python's global RNG once per epoch so callers can compare the sequences of two runs
        draws.append(random.random())
        if scheduler is not None:
            scheduler.step()
    return draws


class DummyModel(nn.Module):
    """Two-parameter linear model computing `x * a + b`, with `a` and `b` initialized from `torch.randn(1)`."""

    def __init__(self):
        super().__init__()
        # `a` is drawn before `b`; the order matters for seeded reproducibility.
        for param_name in ("a", "b"):
            self.register_parameter(param_name, nn.Parameter(torch.randn(1)))

    def forward(self, x):
        scaled = x * self.a
        return scaled + self.b


def parameterized_custom_name_func(func, param_num, param):
    """
    Build the name of a parameterized sub-test from `func.__name__` and the `use_safetensors` parameter.

    The suffix is `use_safetensors` only when `param["use_safetensors"]` is exactly `True`, and `use_pytorch` for
    any other value. `param_num` is not used.
    """
    if param["use_safetensors"] is True:
        suffix = "use_safetensors"
    else:
        suffix = "use_pytorch"
    return f"{func.__name__}_{suffix}"


@parameterized_class(("use_safetensors",), [[True], [False]], class_name_func=parameterized_custom_name_func)
class CheckpointTest(AccelerateTestCase):
    def check_adam_state(self, state1, state2, distributed_type):
        # For DistributedType.XLA, the `accelerator.save_state` function calls `xm._maybe_convert_to_cpu` before
        # saving, which turns every tuple into a list. The `betas` of the first param group are therefore converted
        # back to a tuple (in place, on both state dicts) before comparing.
        # Remove this code once Torch XLA fixes this issue.
        if distributed_type == DistributedType.XLA:
            for state in (state1, state2):
                first_group = state["param_groups"][0]
                first_group["betas"] = tuple(first_group["betas"])
        assert state1 == state2

    def test_with_save_limit(self):
        # With `total_limit=1`, saving twice must still leave exactly one entry in the project directory.
        with tempfile.TemporaryDirectory() as project_dir:
            set_seed(42)
            net = DummyModel()
            adam = torch.optim.Adam(params=net.parameters(), lr=1e-3)
            train_loader, valid_loader = dummy_dataloaders()
            limited_config = ProjectConfiguration(
                project_dir=project_dir, automatic_checkpoint_naming=True, total_limit=1
            )
            accelerator = Accelerator(project_config=limited_config)
            net, adam, train_loader, valid_loader = accelerator.prepare(net, adam, train_loader, valid_loader)

            # Initial save, immediately followed by a second one
            for _ in range(2):
                accelerator.save_state(safe_serialization=self.use_safetensors)

            remaining_entries = os.listdir(accelerator.project_dir)
            assert len(remaining_entries) == 1

    def test_can_resume_training_with_folder(self):
        """
        Save/load round trip with explicitly named checkpoint folders.

        A baseline run of 3 epochs is compared against a run that reloads the initial checkpoint, trains 2 epochs,
        saves, reloads that checkpoint and trains 1 more epoch. Model parameters, optimizer state and the
        per-epoch random draws returned by `train` must match.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            set_seed(42)
            model = DummyModel()
            optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
            train_dataloader, valid_dataloader = dummy_dataloaders()
            # Train baseline
            accelerator = Accelerator()
            model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
                model, optimizer, train_dataloader, valid_dataloader
            )
            # Save initial
            initial = os.path.join(tmpdir, "initial")
            accelerator.save_state(initial, safe_serialization=self.use_safetensors)
            (a, b) = model.a.item(), model.b.item()
            opt_state = optimizer.state_dict()
            ground_truth_rands = train(3, model, train_dataloader, optimizer, accelerator)
            (a1, b1) = model.a.item(), model.b.item()
            opt_state1 = optimizer.state_dict()

            # Train partially
            set_seed(42)
            model = DummyModel()
            optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
            train_dataloader, valid_dataloader = dummy_dataloaders()
            accelerator = Accelerator()
            model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
                model, optimizer, train_dataloader, valid_dataloader
            )
            accelerator.load_state(initial)
            (a2, b2) = model.a.item(), model.b.item()
            opt_state2 = optimizer.state_dict()
            # The freshly loaded state must equal the state captured right after the initial save
            assert a == a2
            assert b == b2
            self.check_adam_state(opt_state, opt_state2, accelerator.distributed_type)

            test_rands = train(2, model, train_dataloader, optimizer, accelerator)
            # Save everything
            checkpoint = os.path.join(tmpdir, "checkpoint")
            accelerator.save_state(checkpoint, safe_serialization=self.use_safetensors)

            # Load everything back in and make sure all states work
            accelerator.load_state(checkpoint)
            test_rands += train(1, model, train_dataloader, optimizer, accelerator)
            (a3, b3) = model.a.item(), model.b.item()
            opt_state3 = optimizer.state_dict()
            # 2 + 1 resumed epochs must end in the same place as the 3 baseline epochs
            assert a1 == a3
            assert b1 == b3
            self.check_adam_state(opt_state1, opt_state3, accelerator.distributed_type)
            assert ground_truth_rands == test_rands

    def test_can_resume_training(self):
        """
        Save/load round trip with automatically named checkpoints.

        A baseline run of 3 epochs is compared against a run that loads `checkpoint_0`, trains 2 epochs, saves,
        loads `checkpoint_1` and trains 1 more epoch. Model parameters, optimizer state and the per-epoch random
        draws returned by `train` must match.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            # The order of the seeded statements below matters: the final comparison of the random draws relies on
            # both runs consuming the RNGs in the same sequence.
            set_seed(42)
            model = DummyModel()
            optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
            train_dataloader, valid_dataloader = dummy_dataloaders()
            project_config = ProjectConfiguration(automatic_checkpoint_naming=True)

            # Train baseline
            accelerator = Accelerator(project_dir=tmpdir, project_config=project_config)
            model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
                model, optimizer, train_dataloader, valid_dataloader
            )
            # Save initial
            accelerator.save_state(safe_serialization=self.use_safetensors)
            (a, b) = model.a.item(), model.b.item()
            opt_state = optimizer.state_dict()
            ground_truth_rands = train(3, model, train_dataloader, optimizer, accelerator)
            (a1, b1) = model.a.item(), model.b.item()
            opt_state1 = optimizer.state_dict()

            # Train partially
            set_seed(42)
            model = DummyModel()
            optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
            train_dataloader, valid_dataloader = dummy_dataloaders()
            # `iteration=1`: the save performed by this second accelerator is loaded back from `checkpoint_1` below
            project_config = ProjectConfiguration(iteration=1, automatic_checkpoint_naming=True)
            accelerator = Accelerator(project_dir=tmpdir, project_config=project_config)
            model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
                model, optimizer, train_dataloader, valid_dataloader
            )
            accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_0"))
            (a2, b2) = model.a.item(), model.b.item()
            opt_state2 = optimizer.state_dict()
            # The freshly loaded state must equal the state captured right after the initial save
            assert a == a2
            assert b == b2
            self.check_adam_state(opt_state, opt_state2, accelerator.distributed_type)

            test_rands = train(2, model, train_dataloader, optimizer, accelerator)
            # Save everything
            accelerator.save_state(safe_serialization=self.use_safetensors)

            # Load everything back in and make sure all states work
            accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_1"))
            test_rands += train(1, model, train_dataloader, optimizer, accelerator)
            (a3, b3) = model.a.item(), model.b.item()
            opt_state3 = optimizer.state_dict()
            # 2 + 1 resumed epochs must end in the same place as the 3 baseline epochs
            assert a1 == a3
            assert b1 == b3
            self.check_adam_state(opt_state1, opt_state3, accelerator.distributed_type)
            assert ground_truth_rands == test_rands

    def test_can_resume_training_checkpoints_relative_path(self):
        """
        Same scenario as `test_can_resume_training`, but with a relative project directory and with the first
        `load_state` call left to infer the checkpoint directory on its own. See #1983.
        """

        @contextmanager
        def temporary_relative_directory():
            # This is equivalent to tempfile.TemporaryDirectory() except that it returns a relative path
            rand_dir = f"test_path_{uuid.uuid4()}"
            os.mkdir(rand_dir)
            try:
                yield rand_dir
            finally:
                shutil.rmtree(rand_dir)

        with temporary_relative_directory() as tmpdir:
            set_seed(42)
            model = DummyModel()
            optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
            train_dataloader, valid_dataloader = dummy_dataloaders()
            project_config = ProjectConfiguration(automatic_checkpoint_naming=True)

            # Train baseline
            accelerator = Accelerator(project_dir=tmpdir, project_config=project_config)
            model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
                model, optimizer, train_dataloader, valid_dataloader
            )
            # Save initial
            accelerator.save_state(safe_serialization=self.use_safetensors)
            (a, b) = model.a.item(), model.b.item()
            opt_state = optimizer.state_dict()
            ground_truth_rands = train(3, model, train_dataloader, optimizer, accelerator)
            (a1, b1) = model.a.item(), model.b.item()
            opt_state1 = optimizer.state_dict()

            # Train partially
            set_seed(42)
            model = DummyModel()
            optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
            train_dataloader, valid_dataloader = dummy_dataloaders()
            project_config = ProjectConfiguration(iteration=1, automatic_checkpoint_naming=True)
            accelerator = Accelerator(project_dir=tmpdir, project_config=project_config)
            model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
                model, optimizer, train_dataloader, valid_dataloader
            )
            accelerator.load_state()  # <= infer the directory automatically
            (a2, b2) = model.a.item(), model.b.item()
            opt_state2 = optimizer.state_dict()
            # The freshly loaded state must equal the state captured right after the initial save.
            # `check_adam_state` already asserts that both optimizer states are equal.
            assert a == a2
            assert b == b2
            self.check_adam_state(opt_state, opt_state2, accelerator.distributed_type)

            test_rands = train(2, model, train_dataloader, optimizer, accelerator)
            # Save everything
            accelerator.save_state(safe_serialization=self.use_safetensors)

            # Load everything back in and make sure all states work
            accelerator.load_state(os.path.join(tmpdir, "checkpoints", "checkpoint_1"))
            test_rands += train(1, model, train_dataloader, optimizer, accelerator)
            (a3, b3) = model.a.item(), model.b.item()
            opt_state3 = optimizer.state_dict()
            # 2 + 1 resumed epochs must end in the same place as the 3 baseline epochs
            assert a1 == a3
            assert b1 == b3
            self.check_adam_state(opt_state1, opt_state3, accelerator.distributed_type)
            assert ground_truth_rands == test_rands

    def test_invalid_registration(self):
        # Registering two plain tensors together with a model and an optimizer must raise a `ValueError` whose
        # message names the tensors (indices 0 and 1) but not the model and optimizer (indices 2 and 3).
        plain_tensors = [torch.tensor([1, 2, 3]), torch.tensor([2, 3, 4])]
        net = DummyModel()
        opt = torch.optim.Adam(net.parameters())
        accelerator = Accelerator()
        with self.assertRaises(ValueError) as ve:
            accelerator.register_for_checkpointing(*plain_tensors, net, opt)
        message = str(ve.exception)
        for reported in ("Item at index 0", "Item at index 1"):
            assert reported in message
        for unreported in ("Item at index 2", "Item at index 3"):
            assert unreported not in message

    def test_with_scheduler(self):
        # Loading a checkpoint must bring the LR scheduler back to the state it had when the checkpoint was saved.
        with tempfile.TemporaryDirectory() as tmpdir:
            first_checkpoint = os.path.join(tmpdir, "checkpoints", "checkpoint_0")
            set_seed(42)
            net = DummyModel()
            adam = torch.optim.Adam(params=net.parameters(), lr=1e-3)
            step_lr = torch.optim.lr_scheduler.StepLR(adam, step_size=1, gamma=0.99)
            train_loader, valid_loader = dummy_dataloaders()
            accelerator = Accelerator(
                project_dir=tmpdir,
                project_config=ProjectConfiguration(automatic_checkpoint_naming=True),
            )
            net, adam, train_loader, valid_loader, step_lr = accelerator.prepare(
                net, adam, train_loader, valid_loader, step_lr
            )

            # Save the initial state, then train so that the scheduler state moves away from it
            accelerator.save_state(safe_serialization=self.use_safetensors)
            state_at_save = step_lr.state_dict()
            train(3, net, train_loader, adam, accelerator, step_lr)
            assert state_at_save != step_lr.state_dict()

            # Load everything back in and make sure the scheduler state is restored
            accelerator.load_state(first_checkpoint)
            assert state_at_save == step_lr.state_dict()

    def test_automatic_loading(self):
        """`load_state()` without a path must restore the most recently saved checkpoint.

        Two checkpoints are written with training in between, followed by one more round of
        training. After `load_state()` the model parameters must equal those recorded at the
        second save and differ from the ones reached after the extra training.
        """
        with tempfile.TemporaryDirectory() as workdir:
            set_seed(42)
            net = DummyModel()
            adam = torch.optim.Adam(params=net.parameters(), lr=1e-3)
            lr_schedule = torch.optim.lr_scheduler.StepLR(adam, step_size=1, gamma=0.99)
            train_dl, valid_dl = dummy_dataloaders()
            naming_config = ProjectConfiguration(automatic_checkpoint_naming=True)
            accelerator = Accelerator(project_dir=workdir, project_config=naming_config)
            prepared = accelerator.prepare(net, adam, train_dl, valid_dl, lr_schedule)
            net, adam, train_dl, valid_dl, lr_schedule = prepared

            # First save: the untrained state
            accelerator.save_state(safe_serialization=self.use_safetensors)
            train(2, net, train_dl, adam, accelerator, lr_schedule)
            saved_a, saved_b = net.a.item(), net.b.item()

            # Second save: this is the checkpoint `load_state()` is expected to pick up
            accelerator.save_state(safe_serialization=self.use_safetensors)
            train(1, net, train_dl, adam, accelerator, lr_schedule)
            later_a, later_b = net.a.item(), net.b.item()

            # No path given: the latest checkpoint is loaded
            accelerator.load_state()
            restored_a, restored_b = net.a.item(), net.b.item()
            assert later_a != restored_a
            assert later_b != restored_b
            assert saved_a == restored_a
            assert saved_b == restored_b

    def test_checkpoint_deletion(self):
        """With `total_limit=2`, only the two newest automatically named checkpoints may remain.

        Eleven checkpoints (indices 0 through 10) are written; afterwards `checkpoint_0` must be
        gone while `checkpoint_9` and `checkpoint_10` must still exist.
        """
        with tempfile.TemporaryDirectory() as workdir:
            set_seed(42)
            net = DummyModel()
            limited_config = ProjectConfiguration(automatic_checkpoint_naming=True, total_limit=2)
            accelerator = Accelerator(project_dir=workdir, project_config=limited_config)
            net = accelerator.prepare(net)

            # Write eleven checkpoints in a row
            save_count = 11
            for _ in range(save_count):
                accelerator.save_state(safe_serialization=self.use_safetensors)

            checkpoint_root = os.path.join(workdir, "checkpoints")

            def checkpoint_exists(index):
                return os.path.exists(os.path.join(checkpoint_root, f"checkpoint_{index}"))

            assert not checkpoint_exists(0)
            assert checkpoint_exists(9)
            assert checkpoint_exists(10)

    @run_first
    @require_non_cpu
    @require_non_torch_xla
    def test_map_location(self):
        """Run this file's `__main__` section through the default launch command.

        The launched script checks `load_state(..., map_location=...)` on an accelerator device.
        The serialization format under test is handed to the subprocess through the environment.
        """
        cmd = DEFAULT_LAUNCH_COMMAND + [inspect.getfile(self.__class__)]

        # `patch_environment` upper-cases the keyword names, so the keyword must be spelled
        # `use_safetensors` to produce `USE_SAFETENSORS`, the variable the `__main__` section of this
        # file reads. The previous spelling (`use_safe_tensors`) exported `USE_SAFE_TENSORS`, which
        # the script never looked at, so the safetensors variant was silently never exercised.
        env_kwargs = dict(use_safetensors=str(self.use_safetensors), omp_num_threads="1")
        with patch_environment(**env_kwargs):
            execute_subprocess_async(cmd)


if __name__ == "__main__":
    # Script section: `test_map_location` above launches this very file as a subprocess.
    # It trains briefly, saves a checkpoint and verifies where the optimizer parameters end up
    # for the different `map_location` values accepted by `Accelerator.load_state`.

    # The serialization format is selected through the `USE_SAFETENSORS` environment variable
    # (only the exact string "True" enables it).
    use_safetensors = os.environ.get("USE_SAFETENSORS", "False") == "True"
    # Fixed, shared location so that every launched process sees the same checkpoint directory
    savedir = "/tmp/accelerate/state_checkpointing"
    model = DummyModel()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
    train_dataloader, valid_dataloader = dummy_dataloaders()
    project_config = ProjectConfiguration(automatic_checkpoint_naming=True)
    # Train baseline
    accelerator = Accelerator(project_dir=savedir, project_config=project_config, mixed_precision="no")
    # Only process 0 resets the save directory so leftovers from a previous run are removed once
    if accelerator.process_index == 0:
        if os.path.exists(savedir):
            shutil.rmtree(savedir)
        os.makedirs(savedir)
    model, optimizer, train_dataloader, valid_dataloader, scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, valid_dataloader, scheduler
    )
    # NOTE(review): model and optimizer are passed through `prepare` a second time here, right after
    # the call above already prepared them - confirm whether this is intentional.
    model, optimizer = accelerator.prepare(model, optimizer)
    train(3, model, train_dataloader, optimizer, accelerator, scheduler)
    # Check that the initial optimizer is loaded on the GPU
    # (only the first parameter of the first param group is inspected)
    for group in optimizer.param_groups:
        param_device = group["params"][0].device
        break
    assert param_device.type == accelerator.device.type
    # Move the model to CPU before saving; the barriers keep all processes in step around the save
    model = model.cpu()
    accelerator.wait_for_everyone()
    accelerator.save_state(safe_serialization=use_safetensors)
    accelerator.wait_for_everyone()

    # Check CPU state
    accelerator.load_state(os.path.join(savedir, "checkpoints", "checkpoint_0"), map_location="cpu")
    for group in optimizer.param_groups:
        param_device = group["params"][0].device
        break
    assert param_device.type == torch.device("cpu").type, (
        f"Loaded optimizer states did not match, expected to be loaded on the CPU but got {param_device}"
    )

    # Check device state
    model.to(accelerator.device)
    accelerator.load_state(os.path.join(savedir, "checkpoints", "checkpoint_0"), map_location="on_device")
    for group in optimizer.param_groups:
        param_device = group["params"][0].device
        break
    assert param_device.type == accelerator.device.type, (
        f"Loaded optimizer states did not match, expected to be loaded on {accelerator.device} but got {param_device}"
    )

    # Check error: an unknown `map_location` value must be rejected with a TypeError
    with pytest.raises(TypeError, match="Unsupported optimizer map location passed"):
        accelerator.load_state(os.path.join(savedir, "checkpoints", "checkpoint_0"), map_location="invalid")
    # Wait until every process is done with the checkpoint before process 0 deletes it
    accelerator.wait_for_everyone()
    if accelerator.process_index == 0:
        shutil.rmtree(savedir)
    accelerator.wait_for_everyone()


================================================
FILE: tests/test_tpu.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import unittest

from accelerate.test_utils import execute_subprocess_async, path_in_accelerate_package, require_tpu


class MultiTPUTester(unittest.TestCase):
    """Launches the generic accelerate test script on TPU through the local `xla_spawn.py` helper."""

    # Script that is executed on the TPU cores
    test_file_path = path_in_accelerate_package("test_utils", "scripts", "test_script.py")
    # Directory of this test file; `xla_spawn.py` is looked up next to it
    test_dir = os.path.dirname(__file__)

    @require_tpu
    def test_tpu(self):
        """Run the test script across 8 cores via `xla_spawn.py` in a subprocess."""
        # Build the argument list directly rather than splitting a formatted string on whitespace:
        # splitting would break any path that contains a space into several arguments.
        cmd = [
            sys.executable,
            f"{self.test_dir}/xla_spawn.py",
            "--num_cores",
            "8",
            str(self.test_file_path),
        ]
        execute_subprocess_async(cmd)


================================================
FILE: tests/test_tracking.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import json
import logging
import os
import random
import re
import subprocess
import tempfile
import unittest
import zipfile
from pathlib import Path
from typing import Optional
from unittest import mock

import numpy as np
import torch
from packaging import version

# We use TF to parse the logs
from accelerate import Accelerator
from accelerate.state import PartialState
from accelerate.test_utils.testing import (
    MockingTestCase,
    TempDirTestCase,
    require_aim,
    require_clearml,
    require_comet_ml,
    require_dvclive,
    require_matplotlib,
    require_mlflow,
    require_pandas,
    require_swanlab,
    require_tensorboard,
    require_trackio,
    require_wandb,
    skip,
)
from accelerate.tracking import (
    AimTracker,
    ClearMLTracker,
    CometMLTracker,
    DVCLiveTracker,
    GeneralTracker,
    MLflowTracker,
    SwanLabTracker,
    TensorBoardTracker,
    TrackioTracker,
    WandBTracker,
)
from accelerate.utils import (
    ProjectConfiguration,
    is_comet_ml_available,
    is_dvclive_available,
    is_tensorboard_available,
)


if is_comet_ml_available():
    from comet_ml import ExperimentConfig

if is_tensorboard_available():
    import struct

    import tensorboard.compat.proto.event_pb2 as event_pb2

if is_dvclive_available():
    from dvclive.plots.metric import Metric
    from dvclive.serialize import load_yaml
    from dvclive.utils import parse_metrics

logger = logging.getLogger(__name__)


@require_tensorboard
class TensorBoardTrackingTest(unittest.TestCase):
    """Tests for logging through `Accelerator(log_with="tensorboard")` into a temporary project directory."""

    @unittest.skipIf(version.parse(np.__version__) >= version.parse("2.0"), "TB doesn't support numpy 2.0")
    def test_init_trackers(self):
        """Initialise the tracker with a config and check that a log file path can be found."""
        project_name = "test_project_with_config"
        with tempfile.TemporaryDirectory() as dirpath:
            accelerator = Accelerator(log_with="tensorboard", project_dir=dirpath)
            config = {"num_iterations": 12, "learning_rate": 1e-2, "some_boolean": False, "some_string": "some_value"}
            accelerator.init_trackers(project_name, config)
            accelerator.end_training()
            # Walk the sub-directories of the project folder and keep the first file of each one;
            # only the file from the last visited directory survives the loop.
            # NOTE(review): `log` stays unbound if the glob yields nothing, and `[0]` raises if a
            # visited directory holds no file - the assertion below would then never be reached.
            for child in Path(f"{dirpath}/{project_name}").glob("*/**"):
                log = list(filter(lambda x: x.is_file(), child.iterdir()))[0]
            assert str(log) != ""

    def test_log(self):
        """Log a few scalar/text values and check that a file exists directly in the project folder."""
        project_name = "test_project_with_log"
        with tempfile.TemporaryDirectory() as dirpath:
            accelerator = Accelerator(log_with="tensorboard", project_dir=dirpath)
            accelerator.init_trackers(project_name)
            values = {"total_loss": 0.1, "iteration": 1, "my_text": "some_value"}
            accelerator.log(values, step=0)
            accelerator.end_training()
            # Logged values are stored in the outermost-tfevents file and can be read in as a TFRecord
            # Names are randomly generated each time
            log = list(filter(lambda x: x.is_file(), Path(f"{dirpath}/{project_name}").iterdir()))[0]
            assert str(log) != ""

    def test_log_with_tensor(self):
        """Log a 0-d tensor and check that it appears in the event file as the scalar 1.0 under tag "tensor"."""
        project_name = "test_project_with_log"
        with tempfile.TemporaryDirectory() as dirpath:
            accelerator = Accelerator(log_with="tensorboard", project_dir=dirpath)
            accelerator.init_trackers(project_name)
            values = {"tensor": torch.tensor(1)}
            accelerator.log(values, step=0)
            accelerator.end_training()
            # Logged values are stored in the outermost-tfevents file and can be read in as a TFRecord
            # Names are randomly generated each time
            log = list(filter(lambda x: x.is_file(), Path(f"{dirpath}/{project_name}").iterdir()))[0]
            # Reading implementation based on https://github.com/pytorch/pytorch/issues/45327#issuecomment-703757685
            with open(log, "rb") as f:
                data = f.read()
            found_tensor = False
            # Consume the file record by record until no bytes are left
            while data:
                # First 8 bytes of a record: unsigned 64-bit payload length
                header = struct.unpack("Q", data[:8])

                # The payload starts 4 bytes after the length field and another 4 bytes follow it
                # (presumably the checksums of the record framing - see the issue linked above)
                event_str = data[12 : 12 + int(header[0])]  # 8+4
                data = data[12 + int(header[0]) + 4 :]
                event = event_pb2.Event()

                event.ParseFromString(event_str)
                # Only summary events carry logged values
                if event.HasField("summary"):
                    for value in event.summary.value:
                        if value.simple_value == 1.0 and value.tag == "tensor":
                            found_tensor = True
            assert found_tensor, "Converted tensor was not found in the log file!"

    def test_project_dir(self):
        """Creating the tensorboard tracker without a directory must fail; with `project_dir` it must succeed."""
        with self.assertRaisesRegex(ValueError, "Logging with `tensorboard` requires a `logging_dir`"):
            _ = Accelerator(log_with="tensorboard")
        with tempfile.TemporaryDirectory() as dirpath:
            _ = Accelerator(log_with="tensorboard", project_dir=dirpath)

    def test_project_dir_with_config(self):
        """A `project_dir` combined with an explicit `ProjectConfiguration` must be accepted."""
        config = ProjectConfiguration(total_limit=30)
        with tempfile.TemporaryDirectory() as dirpath:
            _ = Accelerator(log_with="tensorboard", project_dir=dirpath, project_config=config)


@require_wandb
@mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
class WandBTrackingTest(TempDirTestCase, MockingTestCase):
    """Tests the wandb tracker in offline mode by parsing the output of `wandb sync --view --verbose`."""

    def setUp(self):
        super().setUp()
        # wandb lets us override where logs are stored via the WANDB_DIR env var
        self.add_mocks(mock.patch.dict(os.environ, {"WANDB_DIR": self.tmpdir}))

    @staticmethod
    def parse_log(log: str, section: str, record: bool = True):
        """
        Parses wandb log for `section` and returns a dictionary of
        all items in that section. Section names are based on the
        output of `wandb sync --view --verbose` and items starting
        with "Record" in that result

        Args:
            log (`str`): Full text output of `wandb sync --view --verbose`.
            section (`str`): Name of the section to extract, e.g. "config", "history" or "run".
            record (`bool`): Whether the section header is prefixed with "Record: " in the output.

        Returns:
            `dict`: The key/value pairs found in the first occurrence of the section, all as strings.

        Raises:
            `IndexError`: If the section does not occur in `log`.
        """
        # Big thanks to the W&B team for helping us parse their logs
        pattern = rf"{section} ([\S\s]*?)\n\n"
        if record:
            pattern = rf"Record: {pattern}"
        cleaned_record = re.findall(pattern, log)[0]
        # "config" and "history" sections: quoted tokens alternate between key and value
        if section in ("config", "history"):
            tokens = re.findall(r'"([a-zA-Z0-9_.,]+)', cleaned_record)
            return dict(zip(tokens[0::2], tokens[1::2]))
        # Everything else is laid out as `key: "value"`
        return dict(re.findall(r'(\w+): "([^\s]+)"', cleaned_record))

    @skip
    def test_wandb(self):
        """Log a config, a tag and a few values, then verify them in the synced offline run."""
        project_name = "test_project_with_config"
        accelerator = Accelerator(log_with="wandb")
        config = {"num_iterations": 12, "learning_rate": 1e-2, "some_boolean": False, "some_string": "some_value"}
        kwargs = {"wandb": {"tags": ["my_tag"]}}
        accelerator.init_trackers(project_name, config, kwargs)
        values = {"total_loss": 0.1, "iteration": 1, "my_text": "some_value"}
        accelerator.log(values, step=0)
        accelerator.end_training()
        # The latest offline log is stored at wandb/latest-run/*.wandb
        content = None
        for child in Path(f"{self.tmpdir}/wandb/latest-run").glob("*"):
            if child.is_file() and child.suffix == ".wandb":
                cmd = ["wandb", "sync", "--view", "--verbose", str(child)]
                content = subprocess.check_output(cmd, encoding="utf8", errors="ignore")
                break
        # Fail with a clear message instead of a NameError when no run file was produced
        assert content is not None, "No `.wandb` run file found under wandb/latest-run"

        # Check HPS through careful parsing and cleaning
        logged_items = self.parse_log(content, "config")
        assert logged_items["num_iterations"] == "12"
        assert logged_items["learning_rate"] == "0.01"
        assert logged_items["some_boolean"] == "false"
        assert logged_items["some_string"] == "some_value"

        # Run tags
        logged_items = self.parse_log(content, "run", False)
        assert logged_items["tags"] == "my_tag"

        # Actual logging
        logged_items = self.parse_log(content, "history")
        assert logged_items["total_loss"] == "0.1"
        assert logged_items["iteration"] == "1"
        assert logged_items["my_text"] == "some_value"
        assert logged_items["_step"] == "0"


@require_mlflow
class MLflowTrackingTest(unittest.TestCase):
    """Tests for `MLflowTracker` against a file-based tracking store in a temporary directory."""

    def setUp(self):
        import mlflow

        self.tmpdir = tempfile.TemporaryDirectory()
        # Remove the directory as soon as the test ends instead of leaving it to garbage collection
        self.addCleanup(self.tmpdir.cleanup)
        mlflow.set_tracking_uri("file://" + self.tmpdir.name)

    @require_matplotlib
    def create_mock_figure(self):
        """Create a mock figure for testing."""
        import matplotlib.pyplot as plt

        fig = plt.figure(figsize=(6, 4))
        return fig

    def test_log(self):
        """Test that log calls mlflow.log_metrics with only numeric values and the correct step."""
        import mlflow

        values = {"accuracy": 0.95, "loss": 0.1, "non_numeric": "ignored"}
        tracker = MLflowTracker(experiment_name="test_exp", logging_dir=self.tmpdir.name)
        accelerator = Accelerator(log_with=tracker)
        accelerator.init_trackers(project_name="test_exp")
        tracker.log(values, step=10)

        # Grab the run id before the run is closed by `end_training`
        run_id = tracker.active_run.info.run_id
        accelerator.end_training()

        # Retrieve the run and check the logged metrics.
        run = mlflow.get_run(run_id)
        metrics = run.data.metrics
        self.assertEqual(metrics.get("accuracy"), 0.95)
        self.assertEqual(metrics.get("loss"), 0.1)
        self.assertNotIn("non_numeric", metrics)

    @require_matplotlib
    def test_log_figure(self):
        """Test that log_figure calls mlflow.log_figure with the correct arguments."""
        import mlflow

        dummy_figure = self.create_mock_figure()
        tracker = MLflowTracker(experiment_name="test_exp", logging_dir=self.tmpdir.name)
        accelerator = Accelerator(log_with=tracker)
        accelerator.init_trackers(project_name="test_exp")
        tracker.log_figure(dummy_figure, artifact_file="dummy_figure.png")

        run_id = tracker.active_run.info.run_id
        accelerator.end_training()

        self.assertIn(
            "dummy_figure.png",
            [artifact.path for artifact in mlflow.artifacts.list_artifacts(run_id=run_id)],
        )

    def test_log_artifact(self):
        """Test that log_artifact calls mlflow.log_artifact with the correct file path."""
        import mlflow

        dummy_file_path = os.path.join(self.tmpdir.name, "dummy.txt")
        with open(dummy_file_path, "w") as f:
            f.write("dummy content")
        tracker = MLflowTracker(experiment_name="test_exp", logging_dir=self.tmpdir.name)
        accelerator = Accelerator(log_with=tracker)
        accelerator.init_trackers(project_name="test_exp")
        tracker.log_artifact(dummy_file_path, artifact_path="artifact_dir")

        run_id = tracker.active_run.info.run_id
        accelerator.end_training()

        self.assertIn(
            "artifact_dir/dummy.txt",
            [
                artifact.path
                for artifact in mlflow.artifacts.list_artifacts(run_id=run_id, artifact_path="artifact_dir")
            ],
        )

    def test_log_artifacts(self):
        """Test that log_artifacts calls mlflow.log_artifacts with the correct directory."""
        import mlflow

        dummy_dir = os.path.join(self.tmpdir.name, "dummy_dir")
        os.mkdir(dummy_dir)
        dummy_file_path = os.path.join(dummy_dir, "dummy.txt")
        with open(dummy_file_path, "w") as f:
            f.write("dummy content")
        tracker = MLflowTracker(experiment_name="test_exp", logging_dir=self.tmpdir.name)
        accelerator = Accelerator(log_with=tracker)
        accelerator.init_trackers(project_name="test_exp")
        tracker.log_artifacts(dummy_dir, artifact_path="artifact_dir")

        run_id = tracker.active_run.info.run_id
        accelerator.end_training()

        self.assertIn(
            "artifact_dir/dummy.txt",
            [
                artifact.path
                for artifact in mlflow.artifacts.list_artifacts(run_id=run_id, artifact_path="artifact_dir")
            ],
        )


@require_comet_ml
class CometMLTest(unittest.TestCase):
    """Tests the Comet ML tracker in offline mode by reading the archive it writes."""

    @staticmethod
    def get_value_from_key(log_list, key: str, is_param: bool = False):
        """Extracts `key` from Comet `log`

        Args:
            log_list: JSON-encoded messages, one per entry, each holding a "payload" object.
            key (`str`): Name of the parameter, metric or other logged item to look for.
            is_param (`bool`): Whether to also look at "param" payloads.

        Returns:
            The value of the first matching entry, or `None` if nothing matches.
        """
        for log in log_list:
            j = json.loads(log)["payload"]
            if is_param and "param" in j.keys():
                if j["param"]["paramName"] == key:
                    return j["param"]["paramValue"]
            if "log_other" in j.keys():
                if j["log_other"]["key"] == key:
                    return j["log_other"]["val"]
            if "metric" in j.keys():
                if j["metric"]["metricName"] == key:
                    return j["metric"]["metricValue"]
            if j.get("key", None) == key:
                return j["value"]

    @staticmethod
    def _read_offline_messages(directory):
        """Return the lines of `messages.json` from the first archive found in `directory`."""
        archive_name = os.listdir(directory)[0]  # Comet is nice, it's just a zip file here
        # Close both the archive and the member stream before the temporary directory is removed
        with zipfile.ZipFile(os.path.join(directory, archive_name), "r") as archive:
            with archive.open("messages.json") as messages:
                raw_log = messages.read().decode("utf-8")
        # The text ends with a newline, so the last split element is dropped
        return raw_log.split("\n")[:-1]

    def test_init_trackers(self):
        """The config passed to `init_trackers` must show up as parameters in the offline archive."""
        with tempfile.TemporaryDirectory() as d:
            tracker = CometMLTracker(
                "test_project_with_config", online=False, experiment_config=ExperimentConfig(offline_directory=d)
            )
            accelerator = Accelerator(log_with=tracker)
            config = {"num_iterations": 12, "learning_rate": 1e-2, "some_boolean": False, "some_string": "some_value"}
            accelerator.init_trackers(None, config)
            accelerator.end_training()
            # We parse the raw logs
            list_of_json = self._read_offline_messages(d)
        assert self.get_value_from_key(list_of_json, "num_iterations", True) == 12
        assert self.get_value_from_key(list_of_json, "learning_rate", True) == 0.01
        assert self.get_value_from_key(list_of_json, "some_boolean", True) is False
        assert self.get_value_from_key(list_of_json, "some_string", True) == "some_value"

    def test_log(self):
        """Values passed to `Accelerator.log` must show up in the offline archive together with the step."""
        with tempfile.TemporaryDirectory() as d:
            tracker = CometMLTracker(
                "test_project_with_config", online=False, experiment_config=ExperimentConfig(offline_directory=d)
            )
            accelerator = Accelerator(log_with=tracker)
            accelerator.init_trackers(None)
            values = {"total_loss": 0.1, "iteration": 1, "my_text": "some_value"}
            accelerator.log(values, step=0)
            accelerator.end_training()
            # We parse the raw logs
            list_of_json = self._read_offline_messages(d)
        assert self.get_value_from_key(list_of_json, "curr_step", True) == 0
        assert self.get_value_from_key(list_of_json, "total_loss") == 0.1
        assert self.get_value_from_key(list_of_json, "iteration") == 1
        assert self.get_value_from_key(list_of_json, "my_text") == "some_value"
        assert self.get_value_from_key(list_of_json, "my_text") == "some_value"


@require_clearml
class ClearMLTest(TempDirTestCase, MockingTestCase):
    """Runs the ClearML tracker in offline mode and inspects the files of the offline session."""

    def setUp(self):
        super().setUp()
        # ClearML stores its offline session under CLEARML_CACHE_DIR; point it at the per-test temp dir
        cache_override = {"CLEARML_CACHE_DIR": str(self.tmpdir)}
        self.add_mocks(mock.patch.dict(os.environ, cache_override))

    @staticmethod
    def _get_offline_dir(accelerator):
        """Return the offline session directory of the ClearML task behind `accelerator`."""
        from clearml.config import get_offline_dir

        clearml_task = accelerator.get_tracker("clearml", unwrap=True)
        return get_offline_dir(task_id=clearml_task.id)

    @staticmethod
    def _get_metrics(offline_dir):
        """Read `metrics.jsonl` from `offline_dir` and flatten its per-line JSON lists into one list."""
        metrics_path = os.path.join(offline_dir, "metrics.jsonl")
        collected = []
        with open(metrics_path) as handle:
            for line in handle:
                collected.extend(json.loads(line))
        return collected

    def test_init_trackers(self):
        """The config given to `init_trackers` must be stored in the "General" section of `task.json`."""
        from clearml import Task
        from clearml.utilities.config import text_to_config_dict

        Task.set_offline(True)
        accelerator = Accelerator(log_with="clearml")
        hyperparameters = {
            "num_iterations": 12,
            "learning_rate": 1e-2,
            "some_boolean": False,
            "some_string": "some_value",
        }
        accelerator.init_trackers("test_project_with_config", hyperparameters)

        # Resolve the session directory while the tracker is still active
        session_dir = ClearMLTest._get_offline_dir(accelerator)
        accelerator.end_training()

        task_file = os.path.join(session_dir, "task.json")
        with open(task_file) as handle:
            session = json.load(handle)
        general_section = session["configuration"]["General"]["value"]
        stored_config = text_to_config_dict(general_section)
        assert hyperparameters == stored_config

    def test_log(self):
        """Values logged with and without a step must be recorded under the expected title/series."""
        from clearml import Task

        Task.set_offline(True)
        accelerator = Accelerator(log_with="clearml")
        accelerator.init_trackers("test_project_with_log")
        stepped_values = {"should_be_under_train": 1, "eval_value": 2, "test_value": 3.1, "train_value": 4.1}
        accelerator.log(stepped_values, step=1)
        summary_values = {"single_value_1": 1.1, "single_value_2": 2.2}
        accelerator.log(summary_values)

        session_dir = ClearMLTest._get_offline_dir(accelerator)
        accelerator.end_training()

        recorded = ClearMLTest._get_metrics(session_dir)
        assert (len(stepped_values) + len(summary_values)) == len(recorded)
        for entry in recorded:
            title = entry["metric"]
            variant = entry["variant"]
            if title == "Summary":
                # Values logged without a step
                assert variant in summary_values
                assert entry["value"] == summary_values[variant]
                continue
            if title == "should_be_under_train":
                # A name without a recognised prefix is filed under the "train" series
                assert variant == "train"
                assert entry["iter"] == 1
                assert entry["value"] == stepped_values["should_be_under_train"]
                continue
            # Remaining entries: the original key is "<series>_<title>"
            original_key = variant + "_" + title
            assert original_key in stepped_values
            assert entry["iter"] == 1
            assert entry["value"] == stepped_values[original_key]

    def test_log_images(self):
        """Every image handed to `log_images` must end up as a jpeg in the session's data folder."""
        from clearml import Task

        Task.set_offline(True)
        accelerator = Accelerator(log_with="clearml")
        accelerator.init_trackers("test_project_with_log_images")

        # One 2-D image and one 3-channel image built from it
        diagonal = np.eye(256, 256, dtype=np.uint8) * 255
        empty_channels = np.zeros((256, 256, 2), dtype=np.uint8)
        diagonal_3d = np.concatenate((np.atleast_3d(diagonal), empty_channels), axis=2)
        images = {"base_image": diagonal, "base_image_3d": diagonal_3d}
        accelerator.get_tracker("clearml").log_images(images, step=1)

        session_dir = ClearMLTest._get_offline_dir(accelerator)
        accelerator.end_training()

        saved_jpegs = list(Path(os.path.join(session_dir, "data")).rglob("*.jpeg"))
        assert len(saved_jpegs) == len(images)

    def test_log_table(self):
        """Tables logged from lists (with and without explicit columns) must be stored as plots."""
        from clearml import Task

        Task.set_offline(True)
        accelerator = Accelerator(log_with="clearml")
        accelerator.init_trackers("test_project_with_log_table")

        accelerator.get_tracker("clearml").log_table(
            "from lists with columns", columns=["A", "B", "C"], data=[[1, 3, 5], [2, 4, 6]]
        )
        # Without `columns`, the first row of `data` serves as the header
        headerless_rows = [["A2", "B2", "C2"], [7, 9, 11], [8, 10, 12]]
        accelerator.get_tracker("clearml").log_table("from lists", data=headerless_rows)
        session_dir = ClearMLTest._get_offline_dir(accelerator)
        accelerator.end_training()

        recorded = ClearMLTest._get_metrics(session_dir)
        assert len(recorded) == 2
        for entry in recorded:
            title = entry["metric"]
            assert title in ("from lists", "from lists with columns")
            table = json.loads(entry["plot_str"])["data"][0]
            if title == "from lists with columns":
                print(table)
                expected_header = ["A", "B", "C"]
                expected_cells = [[1, 2], [3, 4], [5, 6]]
            else:
                expected_header = ["A2", "B2", "C2"]
                expected_cells = [[7, 8], [9, 10], [11, 12]]
            self.assertCountEqual(table["header"]["values"], expected_header)
            self.assertCountEqual(table["cells"]["values"], expected_cells)

    @require_pandas
    def test_log_table_pandas(self):
        """A table logged from a pandas DataFrame must be stored as a single plot."""
        import pandas as pd
        from clearml import Task

        Task.set_offline(True)
        accelerator = Accelerator(log_with="clearml")
        accelerator.init_trackers("test_project_with_log_table_pandas")

        frame = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
        accelerator.get_tracker("clearml").log_table("from df", dataframe=frame, step=1)

        session_dir = ClearMLTest._get_offline_dir(accelerator)
        accelerator.end_training()

        recorded = ClearMLTest._get_metrics(session_dir)
        assert len(recorded) == 1
        only_entry = recorded[0]
        assert only_entry["metric"] == "from df"
        table = json.loads(only_entry["plot_str"])["data"][0]
        self.assertCountEqual(table["header"]["values"], [["A"], ["B"], ["C"]])
        self.assertCountEqual(table["cells"]["values"], [[1, 2], [3, 4], [5, 6]])


@require_swanlab
@mock.patch.dict(os.environ, {"SWANLAB_MODE": "local"})
class SwanLabTrackingTest(TempDirTestCase, MockingTestCase):
    """Tests the SwanLab tracker in local mode by parsing the backup file written into the run directory."""

    def setUp(self):
        super().setUp()
        # Setting Path where SwanLab parsed log files are saved via the SWANLAB_LOG_DIR env var
        self.add_mocks(mock.patch.dict(os.environ, {"SWANLAB_LOG_DIR": self.tmpdir}))

    @skip
    def test_swanlab(self):
        """Run a fake training session, then compare the parsed backup against what was logged."""
        # Disable hardware monitoring to prevent errors in test mode.
        import swanlab
        from swanlab.log.backup import BackupHandler
        from swanlab.log.backup.datastore import DataStore
        from swanlab.log.backup.models import ModelsParser

        swanlab.merge_settings(swanlab.Settings(hardware_monitor=False))
        # Start a fake training session.
        accelerator = Accelerator(log_with="swanlab")
        project_name = "test_project_with_config"
        experiment_name = "test"
        description = "test project for swanlab"
        tags = ["my_tag"]
        config = {
            "epochs": 10,
            "learning_rate": 0.01,
            "offset": 0.1,
        }
        kwargs = {
            "swanlab": {
                "experiment_name": experiment_name,
                "description": description,
                "tags": tags,
            }
        }
        accelerator.init_trackers(project_name, config, kwargs)
        # Local bookkeeping of everything that is logged, used for the comparisons further down
        record_metrics = []
        record_scalars = []
        record_images_count = 0
        record_logs = []
        # The loop starts at 1, so it runs `epochs - 1` times
        for epoch in range(1, swanlab.config.epochs):
            acc = 1 - 2**-epoch - random.random() / epoch - 0.1
            loss = 2**-epoch + random.random() / epoch + 0.1
            # Logged directly through swanlab (not through the accelerator); the return value is kept
            ll = swanlab.log(
                {
                    "accuracy": acc,
                    "loss": loss,
                    "image": swanlab.Image(np.random.random((3, 3, 3))),
                },
                step=epoch,
            )
            log = f"epoch={epoch}, accuracy={acc}, loss={loss}"
            # The printed line is expected to show up among the backed-up log messages (checked below)
            print(log)
            record_scalars.extend([acc, loss])
            record_images_count += 1
            record_logs.append(log)
            record_metrics.extend([x for _, x in ll.items()])
        accelerator.end_training()

        # Load latest offline log
        run_dir = swanlab.get_run().public.run_dir
        assert os.path.exists(run_dir) is True
        ds = DataStore()
        ds.open_for_scan(os.path.join(run_dir.__str__(), BackupHandler.BACKUP_FILE).__str__())
        with ModelsParser() as models_parser:
            for record in ds:
                # Skip empty records
                if record is None:
                    continue
                models_parser.parse_record(record)
        header, project, experiment, logs, runtime, columns, scalars, medias, footer = models_parser.get_parsed()

        # test file header
        assert header.backup_type == "DEFAULT"

        # test project info
        assert project.name == project_name
        assert project.workspace is None
        assert project.public is None

        # test experiment info
        assert experiment.name is not None
        assert experiment.description == description
        assert experiment.tags == tags

        # test log record
        backup_logs = [log.message for log in logs]
        for record_log in record_logs:
            assert record_log in backup_logs, "Log not found in backup logs: " + record_log

        # test runtime info
        runtime_info = runtime.to_file_model(os.path.join(run_dir.__str__(), "files"))
        assert runtime_info.conda is None, "Not using conda, should be None"
        assert isinstance(runtime_info.requirements, str), "Requirements should be a string"
        assert isinstance(runtime_info.metadata, dict), "Metadata should be a dictionary"
        assert isinstance(runtime_info.config, dict), "Config should be a dictionary"
        for key in runtime_info.config:
            assert key in config, f"Config key {key} not found in original config"
            assert runtime_info.config[key]["value"] == config[key], (
                f"Config value for {key} does not match original value"
            )

        # test scalar
        assert len(scalars) + len(medias) == len(record_metrics), "Total metrics count does not match"
        # NOTE(review): despite their names, `backup_scalars` and `backup_images` are filtered from
        # `record_metrics` (the values returned by `swanlab.log`), not from the parsed backup; only
        # the counts are compared against the parsed `scalars`.
        backup_scalars = [
            metric.metric["data"]
            for metric in record_metrics
            if metric.column_info.chart_type.value.column_type == "FLOAT"
        ]
        assert len(backup_scalars) == len(scalars), "Total scalars count does not match"
        for scalar in backup_scalars:
            assert scalar in record_scalars, f"Scalar {scalar} not found in original scalars"
        backup_images = [
            metric for metric in record_metrics if metric.column_info.chart_type.value.column_type == "IMAGE"
        ]
        assert len(backup_images) == record_images_count, "Total images count does not match"


class MyCustomTracker(GeneralTracker):
    """Basic tracker that writes to a csv for testing.

    Every logged dict becomes one row of ``log.csv`` inside ``dir``. The file is not opened in ``__init__``; it is
    opened the first time `start()` is called.

    Args:
        dir (`str`):
            Directory in which ``log.csv`` is created.
        **kwargs:
            Forwarded untouched to `GeneralTracker.__init__`.
    """

    # Columns of the csv file; rows written by the tracker may fill any subset of them.
    _col_names = [
        "total_loss",
        "iteration",
        "my_text",
        "learning_rate",
        "num_iterations",
        "some_boolean",
        "some_string",
    ]

    name = "my_custom_tracker"
    requires_logging_directory = False

    def __init__(self, dir: str, **kwargs):
        super().__init__(**kwargs)
        self.log_dir = dir
        # Both are populated by `start()`
        self.f = None
        self.writer = None

    def start(self):
        """Open ``log.csv`` and write the header row. Does nothing if the file was already opened."""
        if self.f is None:
            # The csv module expects file objects opened with `newline=""`, otherwise the writer's own
            # "\r\n" line terminator gets translated a second time on platforms that rewrite "\n".
            self.f = open(os.path.join(self.log_dir, "log.csv"), "w+", newline="")
            self.writer = csv.DictWriter(self.f, fieldnames=self._col_names)
            self.writer.writeheader()

    @property
    def tracker(self):
        """The underlying `csv.DictWriter` (`None` until `start()` has been called)."""
        return self.writer

    def store_init_configuration(self, values: dict):
        """Write the configuration `values` as one csv row."""
        logger.info("Call init")
        self.writer.writerow(values)

    def log(self, values: dict, step: Optional[int]):
        """Write `values` as one csv row; `step` is accepted but not stored."""
        logger.info("Call log")
        self.writer.writerow(values)

    def finish(self):
        """Close the csv file, if it was ever opened."""
        # `start()` may never have run, in which case there is nothing to close.
        if self.f is not None:
            self.f.close()


class CustomTrackerTestCase(unittest.TestCase):
    """Runs `MyCustomTracker` through an `Accelerator` and checks the first data row of the csv it wrote."""

    def test_init_trackers(self):
        hyperparams = {"num_iterations": 12, "learning_rate": 1e-2, "some_boolean": False, "some_string": "some_value"}
        # Columns that were not part of the config are expected to be empty
        expected_row = {
            "total_loss": "",
            "iteration": "",
            "my_text": "",
            "learning_rate": "0.01",
            "num_iterations": "12",
            "some_boolean": "False",
            "some_string": "some_value",
        }
        with tempfile.TemporaryDirectory() as tmp_dir:
            accelerator = Accelerator(log_with=MyCustomTracker(tmp_dir))
            accelerator.init_trackers("Some name", hyperparams)
            accelerator.end_training()
            with open(f"{tmp_dir}/log.csv") as csv_file:
                first_row = next(csv.DictReader(csv_file))
            assert first_row == expected_row

    def test_log(self):
        metrics = {"total_loss": 0.1, "iteration": 1, "my_text": "some_value"}
        # Only the logged columns are filled, the config columns stay empty
        expected_row = {
            "total_loss": "0.1",
            "iteration": "1",
            "my_text": "some_value",
            "learning_rate": "",
            "num_iterations": "",
            "some_boolean": "",
            "some_string": "",
        }
        with tempfile.TemporaryDirectory() as tmp_dir:
            accelerator = Accelerator(log_with=MyCustomTracker(tmp_dir))
            accelerator.init_trackers("Some name")
            accelerator.log(metrics, step=0)
            accelerator.end_training()
            with open(f"{tmp_dir}/log.csv") as csv_file:
                first_row = next(csv.DictReader(csv_file))
            assert first_row == expected_row


@require_dvclive
@mock.patch("dvclive.live.get_dvc_repo", return_value=None)
class DVCLiveTrackingTest(unittest.TestCase):
    """Checks the params file and the per-metric step logs written through the `dvclive` tracker."""

    def test_init_trackers(self, mock_repo):
        hyperparams = {
            "num_iterations": 12,
            "learning_rate": 1e-2,
            "some_boolean": False,
            "some_string": "some_value",
        }
        with tempfile.TemporaryDirectory() as tmp_dir:
            accelerator = Accelerator(log_with="dvclive")
            dvclive_kwargs = {"dir": tmp_dir, "save_dvc_exp": False, "dvcyaml": None}
            accelerator.init_trackers("test_project_with_config", hyperparams, {"dvclive": dvclive_kwargs})
            accelerator.end_training()
            live = accelerator.trackers[0].live
            # The stored params must round-trip to the original config
            assert load_yaml(live.params_file) == hyperparams

    def test_log(self, mock_repo):
        metrics = {"total_loss": 0.1, "iteration": 1, "my_text": "some_value"}
        with tempfile.TemporaryDirectory() as tmp_dir:
            accelerator = Accelerator(log_with="dvclive", project_dir=tmp_dir)
            dvclive_kwargs = {"dir": tmp_dir, "save_dvc_exp": False, "dvcyaml": None}
            accelerator.init_trackers("test_project_with_log", init_kwargs={"dvclive": dvclive_kwargs})
            # Two calls without an explicit step (steps 0 and 1), then an explicit step 3 (step 2 is skipped)
            for step_kwargs in ({}, {}, {"step": 3}):
                accelerator.log(metrics, **step_kwargs)
            accelerator.end_training()
            live = accelerator.trackers[0].live
            logs, latest = parse_metrics(live)
            assert latest.pop("step") == 3
            assert latest == metrics
            scalar_dir = os.path.join(live.plots_dir, Metric.subfolder)
            for metric_name in metrics:
                tsv_path = os.path.join(scalar_dir, f"{metric_name}.tsv")
                recorded_steps = [int(row["step"]) for row in logs[tsv_path]]
                assert recorded_steps == [0, 1, 3]


class TrackerDeferredInitializationTest(unittest.TestCase):
    """
    Checks that constructing a tracker leaves `PartialState` untouched: the shared state must still be empty after
    the tracker is built and only get populated once an `Accelerator` is created with it. This guards against
    premature `PartialState` access (and `torch.distributed` init) before `Accelerator` has configured the
    distributed environment, especially with `InitProcessGroupKwargs`.
    """

    def _check_deferred_init(self, build_tracker):
        """Reset the shared state, build the tracker via `build_tracker()` and run the before/after checks."""
        PartialState._reset_state()
        tracker = build_tracker()
        # Building the tracker alone must not have touched the shared state...
        self.assertEqual(PartialState._shared_state, {})
        _ = Accelerator(log_with=tracker)
        # ...while creating the `Accelerator` populates it
        self.assertNotEqual(PartialState._shared_state, {})

    @require_tensorboard
    def test_tensorboard_deferred_init(self):
        """TensorBoard tracker construction must not initialize the distributed state"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            self._check_deferred_init(lambda: TensorBoardTracker(run_name="test_tb", logging_dir=tmp_dir))

    @require_wandb
    def test_wandb_deferred_init(self):
        """WandB tracker construction must not initialize the distributed state"""
        self._check_deferred_init(lambda: WandBTracker(run_name="test_wandb"))

    @require_trackio
    def test_trackio_deferred_init(self):
        """trackio tracker construction must not initialize the distributed state"""
        self._check_deferred_init(lambda: TrackioTracker(run_name="test_trackio"))

    @require_comet_ml
    def test_comet_ml_deferred_init(self):
        """CometML tracker construction must not initialize the distributed state"""
        self._check_deferred_init(lambda: CometMLTracker(run_name="test_comet"))

    @require_aim
    def test_aim_deferred_init(self):
        """Aim tracker construction must not initialize the distributed state"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            self._check_deferred_init(lambda: AimTracker(run_name="test_aim", repo=tmp_dir))

    @require_mlflow
    def test_mlflow_deferred_init(self):
        """MLflow tracker construction must not initialize the distributed state"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            self._check_deferred_init(lambda: MLflowTracker(experiment_name="test_mlflow", logging_dir=tmp_dir))

    @require_clearml
    def test_clearml_deferred_init(self):
        """ClearML tracker construction must not initialize the distributed state"""
        self._check_deferred_init(lambda: ClearMLTracker(run_name="test_clearml"))

    @require_dvclive
    def test_dvclive_deferred_init(self):
        """DVCLive tracker construction must not initialize the distributed state"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            self._check_deferred_init(lambda: DVCLiveTracker(dir=tmp_dir))

    @require_swanlab
    def test_swanlab_deferred_init(self):
        """SwanLab tracker construction must not initialize the distributed state"""
        self._check_deferred_init(lambda: SwanLabTracker(run_name="test_swanlab"))


================================================
FILE: tests/test_utils.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pickle
import tempfile
import unittest
import warnings
from collections import UserDict, namedtuple
from typing import NamedTuple, Optional
from unittest.mock import Mock, patch

import numpy as np
import pytest
import torch
from torch import nn

from accelerate.big_modeling import cpu_offload_with_hook
from accelerate.hooks import attach_align_device_hook, remove_hook_from_module
from accelerate.state import PartialState
from accelerate.test_utils.testing import (
    require_huggingface_suite,
    require_non_cpu,
    require_non_torch_xla,
    require_torch_min_version,
    require_tpu,
    require_triton,
    torch_device,
)
from accelerate.test_utils.training import RegressionModel
from accelerate.utils import (
    CannotPadNestedTensorWarning,
    check_os_kernel,
    clear_environment,
    concatenate,
    convert_dict_to_env_variables,
    convert_outputs_to_fp32,
    convert_to_fp32,
    extract_model_from_parallel,
    find_device,
    has_offloaded_params,
    is_torch_xla_available,
    listify,
    pad_across_processes,
    pad_input_tensors,
    patch_environment,
    purge_accelerate_environment,
    recursively_apply,
    save,
    send_to_device,
)
from accelerate.utils.operations import is_namedtuple


# torch_xla is optional: `xs`, `xr` and `FSDPv2` are only bound when it is available.
# They are used by `test_extract_model_recursive_fsdpv2` below.
if is_torch_xla_available():
    import torch_xla.distributed.spmd as xs
    import torch_xla.runtime as xr
    from torch_xla.experimental.spmd_fully_sharded_data_parallel import SpmdFullyShardedDataParallel as FSDPv2

# Three-field namedtuple used by `test_send_to_device` to check that the namedtuple type survives the move.
ExampleNamedTuple = namedtuple("ExampleNamedTuple", "a b c")


class UtilsTester(unittest.TestCase):
    def setUp(self):
        """Instantiate `PartialState` before every test, because logging requires an initialized state."""
        PartialState()

    def test_send_to_device(self):
        """`send_to_device` must move tensors nested in common containers and keep the container types."""
        source = torch.randn(5, 2)
        target = torch.device(f"{torch_device}:0")

        def matches_source(moved):
            # Compare on CPU so the check works whatever the target device is
            return torch.equal(moved.cpu(), source)

        def check_tensor_list(moved_list):
            assert isinstance(moved_list, list)
            assert matches_source(moved_list[0])
            assert matches_source(moved_list[1])

        # Bare tensor
        assert matches_source(send_to_device(source, target))

        # Tuple holding a tensor, a list of tensors and a non-tensor leaf
        as_tuple = send_to_device((source, [source, source], 1), target)
        assert isinstance(as_tuple, tuple)
        assert matches_source(as_tuple[0])
        check_tensor_list(as_tuple[1])
        assert as_tuple[2] == 1

        # Plain dict
        as_dict = send_to_device({"a": source, "b": [source, source], "c": 1}, target)
        assert isinstance(as_dict, dict)
        assert matches_source(as_dict["a"])
        check_tensor_list(as_dict["b"])
        assert as_dict["c"] == 1

        # Namedtuple
        as_namedtuple = send_to_device(ExampleNamedTuple(a=source, b=[source, source], c=1), target)
        assert isinstance(as_namedtuple, ExampleNamedTuple)
        assert matches_source(as_namedtuple.a)
        check_tensor_list(as_namedtuple.b)
        assert as_namedtuple.c == 1

        # UserDict
        as_user_dict = send_to_device(UserDict({"a": source, "b": [source, source], "c": 1}), target)
        assert isinstance(as_user_dict, UserDict)
        assert matches_source(as_user_dict["a"])
        check_tensor_list(as_user_dict["b"])
        assert as_user_dict["c"] == 1

    def test_honor_type(self):
        """With `error_on_other_type=True`, a non-tensor leaf must raise a `TypeError` with the exact message."""
        expected_message = "Unsupported types (<class 'int'>) passed to `tensor`. Only nested list/tuple/dicts of objects that are valid for `is_torch_tensor` should be passed."
        with self.assertRaises(TypeError) as raised:
            recursively_apply(torch.tensor, (torch.tensor(1), 1), error_on_other_type=True)
        assert str(raised.exception) == expected_message

    def test_listify(self):
        """`listify` must turn 1D, 2D and 3D tensors back into the nested lists they were built from."""
        nested_lists = [
            [1, 2, 3, 4, 5],
            [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]],
            [[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]], [[11, 12, 13, 14, 15], [16, 17, 18, 19, 20]]],
        ]
        for values in nested_lists:
            assert listify(torch.tensor(values)) == values

    def test_patch_environment(self):
        """Variables exist (upper-cased, stringified) inside `patch_environment` and are gone afterwards."""
        with patch_environment(aa=1, BB=2):
            inside = (os.environ.get("AA"), os.environ.get("BB"))
            assert inside == ("1", "2")

        for name in ("AA", "BB"):
            assert name not in os.environ

    def test_patch_environment_key_exists(self):
        """A nested `patch_environment` must restore the values set by the outer one when it exits."""

        def current(*names):
            return [os.environ.get(name) for name in names]

        # check that patch_environment correctly restores pre-existing env vars
        with patch_environment(aa=1, BB=2):
            assert current("AA", "BB") == ["1", "2"]

            with patch_environment(Aa=10, bb="20", cC=30):
                assert current("AA", "BB", "CC") == ["10", "20", "30"]

            # Back to the outer values; the variable only set by the inner context is removed
            assert current("AA", "BB") == ["1", "2"]
            assert "CC" not in os.environ

        for name in ("AA", "BB", "CC"):
            assert name not in os.environ

    def test_patch_environment_restores_on_error(self):
        """The original value of a patched variable must come back even if the body raises."""
        # we need to find an upper-case envvar
        # because `patch_environment` upper-cases all keys...
        key, orig_value = next((name, value) for name, value in os.environ.items() if name.isupper())
        new_value = f"{orig_value}_foofoofoo"
        with pytest.raises(RuntimeError):
            with patch_environment(**{key: new_value}):
                assert os.environ[key] == new_value
                assert os.getenv(key) == new_value  # noqa: TID251
                raise RuntimeError("Oopsy daisy!")
        assert os.environ[key] == orig_value
        assert os.getenv(key) == orig_value  # noqa: TID251

    def test_clear_environment(self):
        """`clear_environment` must empty `os.environ` inside the block and restore it even on error."""
        key, value = dict(os.environ).popitem()
        with pytest.raises(RuntimeError):
            with clear_environment():
                assert key not in os.environ
                # test the environment is actually cleared
                assert not os.getenv(key)  # noqa: TID251
                raise RuntimeError("Oopsy daisy!")
        # Test values are restored
        assert os.getenv(key) == value  # noqa: TID251
        assert os.environ[key] == value

    def test_can_undo_convert_outputs(self):
        """After unwrapping with `keep_fp32_wrapper=False`, pickling the model must not raise."""
        wrapped = RegressionModel()
        wrapped._original_forward = wrapped.forward
        wrapped.forward = convert_outputs_to_fp32(wrapped.forward)
        unwrapped = extract_model_from_parallel(wrapped, keep_fp32_wrapper=False)
        pickle.dumps(unwrapped)

    @require_non_cpu
    def test_can_undo_fp16_conversion(self):
        """Same pickling check as above, with an fp16 autocast layer underneath the fp32 wrapper."""
        wrapped = RegressionModel()
        wrapped._original_forward = wrapped.forward
        fp16_autocast = torch.autocast(device_type=torch_device, dtype=torch.float16)
        wrapped.forward = fp16_autocast(wrapped.forward)
        wrapped.forward = convert_outputs_to_fp32(wrapped.forward)
        unwrapped = extract_model_from_parallel(wrapped, keep_fp32_wrapper=False)
        pickle.dumps(unwrapped)

    @require_triton
    @require_non_cpu
    def test_dynamo(self):
        """A forward wrapped in autocast + fp32 conversion must still run once compiled with inductor."""
        model = RegressionModel()
        model._original_forward = model.forward
        fp16_autocast = torch.autocast(device_type=torch_device, dtype=torch.float16)
        model.forward = fp16_autocast(model.forward)
        model.forward = convert_outputs_to_fp32(model.forward)
        model.forward = torch.compile(model.forward, backend="inductor")
        batch = torch.randn(4, 10).to(torch_device)
        # Only checks that the call goes through; the output itself is not inspected
        model(batch)

    def test_extract_model(self):
        """Unwrapping a `DataParallel` model must give back the original module."""
        base_model = RegressionModel()
        # could also do a test with DistributedDataParallel, but difficult to run on CPU or single GPU
        parallel_model = torch.nn.parallel.DataParallel(base_model)
        unwrapped = extract_model_from_parallel(parallel_model)

        assert base_model == unwrapped

    @require_tpu
    @require_huggingface_suite
    def test_extract_model_recursive_fsdpv2(self):
        """Recursive extraction must give back the original state dict keys when a submodule is FSDPv2-wrapped."""
        # Specifically tests for FSDPv2 extraction
        # reported in https://github.com/huggingface/transformers/pull/29780
        xr.use_spmd()
        from transformers import AutoModelForCausalLM

        model = AutoModelForCausalLM.from_pretrained("gpt2")
        keys_before = list(model.state_dict().keys())
        device_count = xr.global_runtime_device_count()
        # Set environment for FSDPv2 to be active
        mesh = xs.Mesh(np.array(range(device_count)), (device_count, 1), axis_names=("fsdp", "tensor"))
        xs.set_global_mesh(mesh)

        # Wrap a single nested layer in place, so that only a recursive unwrap can reach it
        model.wte = FSDPv2(model.wte)

        unwrapped = extract_model_from_parallel(model, recursive=True)
        keys_after = list(unwrapped.state_dict().keys())
        for original_key, new_key in zip(keys_before, keys_after):
            assert original_key == new_key, f"Keys did not align: {original_key} != {new_key}"

    def test_dynamo_extract_model_keep_torch_compile(self):
        """With `keep_torch_compile=True`, the result is still compiled and wraps the original module."""
        base_model = RegressionModel()
        reference_compiled = torch.compile(base_model)

        # could also do a test with DistributedDataParallel, but difficult to run on CPU or single GPU
        parallel_model = torch.nn.parallel.DataParallel(base_model)
        parallel_compiled = torch.compile(parallel_model)
        unwrapped = extract_model_from_parallel(parallel_compiled, keep_torch_compile=True)

        assert reference_compiled._orig_mod == unwrapped._orig_mod

    def test_dynamo_extract_model_remove_torch_compile(self):
        """With `keep_torch_compile=False`, the result is the original, uncompiled module."""
        base_model = RegressionModel()
        reference_compiled = torch.compile(base_model)

        # could also do a test with DistributedDataParallel, but difficult to run on CPU or single GPU
        parallel_model = torch.nn.parallel.DataParallel(base_model)
        parallel_compiled = torch.compile(parallel_model)
        unwrapped = extract_model_from_parallel(parallel_compiled, keep_torch_compile=False)

        assert reference_compiled._orig_mod == unwrapped

    def test_find_device(self):
        """`find_device` returns the device of a tensor found in the structure, or `None` without any tensor."""
        cpu = torch.device("cpu")
        in_list = [1, "a", torch.tensor([1, 2, 3])]
        in_dict = {"a": 1, "b": torch.tensor([1, 2, 3])}
        assert find_device(in_list) == cpu
        assert find_device(in_dict) == cpu
        assert find_device([1, "a"]) is None

    def test_check_os_kernel_no_warning_when_release_gt_min(self):
        """`check_os_kernel` must stay silent on Linux when the kernel release is above the minimum."""
        import logging

        # min version is 5.5
        with patch("platform.uname", return_value=Mock(release="5.15.0-35-generic", system="Linux")):
            # `test_check_os_kernel_warning_when_release_lt_min` shows the warning is emitted through
            # `logging`, so recording `warnings` alone could never fail. `assertLogs` itself fails when
            # nothing is logged, hence the sentinel record: it must be the only one captured.
            with warnings.catch_warnings(record=True) as w, self.assertLogs(level="WARNING") as ctx:
                logging.getLogger().warning("sentinel")
                check_os_kernel()
            assert len(w) == 0
            assert [record.getMessage() for record in ctx.records] == ["sentinel"]

    def test_check_os_kernel_no_warning_when_not_linux(self):
        """`check_os_kernel` must stay silent on non-Linux systems, even with an old release string."""
        import logging

        # system must be Linux
        with patch("platform.uname", return_value=Mock(release="5.4.0-35-generic", system="Darwin")):
            # `test_check_os_kernel_warning_when_release_lt_min` shows the warning is emitted through
            # `logging`, so recording `warnings` alone could never fail. `assertLogs` itself fails when
            # nothing is logged, hence the sentinel record: it must be the only one captured.
            with warnings.catch_warnings(record=True) as w, self.assertLogs(level="WARNING") as ctx:
                logging.getLogger().warning("sentinel")
                check_os_kernel()
            assert len(w) == 0
            assert [record.getMessage() for record in ctx.records] == ["sentinel"]

    def test_check_os_kernel_warning_when_release_lt_min(self):
        """An old Linux kernel must produce exactly one WARNING naming the detected and minimum versions."""
        # min version is 5.5
        old_kernel = Mock(release="5.4.0-35-generic", system="Linux")
        with patch("platform.uname", return_value=old_kernel):
            with self.assertLogs() as captured:
                check_os_kernel()
            assert len(captured.records) == 1
            record = captured.records[0]
            assert record.levelname == "WARNING"
            for version in ("5.4.0", "5.5.0"):
                assert version in record.msg

    @require_non_torch_xla
    def test_save_safetensor_shared_memory(self):
        """Saving a state dict with two names for the same layer must log one "Removed shared tensor" warning."""

        class SharedLayerModel(nn.Module):
            def __init__(self):
                super().__init__()
                self.a = nn.Linear(100, 100)
                # Second attribute pointing at the very same layer as `a`
                self.b = self.a

            def forward(self, x):
                hidden = self.a(x)
                return self.b(hidden)

        state_dict = SharedLayerModel().state_dict()
        with tempfile.TemporaryDirectory() as tmp_dir:
            target_file = os.path.join(tmp_dir, "model.safetensors")
            with self.assertLogs(level="WARNING") as captured:
                save(state_dict, target_file, safe_serialization=True)
                assert len(captured.records) == 1
                assert "Removed shared tensor" in captured.output[0]

    @require_torch_min_version(version="1.12")
    def test_pad_across_processes(self):
        """Nested tensors are returned untouched with a warning; regular tensors keep their size here."""
        from torch.nested import nested_tensor

        ragged = nested_tensor([[1, 2, 3], [1], [1, 2]])
        with self.assertWarns(CannotPadNestedTensorWarning):
            returned = pad_across_processes(ragged)
        assert returned is ragged

        # Basic functionality
        dense = torch.randn(4, 3, 100)
        assert pad_across_processes(dense, dim=-1).shape[-1] == 100

        # dim = -4 is out of bounds
        assert pad_across_processes(dense, dim=-4) is dense

    def test_slice_and_concatenate(self):
        """`pad_input_tensors` must grow the first dimension to the expected size for each base case."""
        base_cases = [
            # (num_processes, batch_size, per-sample shape, expected number of items)
            (2, 1, (4,), 2),
            (2, 3, (4,), 4),
            (3, 4, (4, 4), 6),
            (4, 3, (4, 4), 4),
            (6, 4, (4, 4), 6),
            (6, 1, (4, 4), 6),
            (6, 2, (4, 4), 6),
            (6, 61, (4, 4), 66),
        ]
        for num_processes, batch_size, sample_shape, expected_items in base_cases:
            batch = torch.rand(batch_size, *sample_shape)
            result = pad_input_tensors(batch, batch_size, num_processes)
            # Only the first dimension may change
            assert result.shape == torch.Size([expected_items, *sample_shape])

    def test_send_to_device_compiles(self):
        """`send_to_device` must compile with `fullgraph=True` and run on a bf16 tensor."""
        bf16_zeros = torch.zeros([1], dtype=torch.bfloat16)
        torch.compile(send_to_device, fullgraph=True)(bf16_zeros, "cpu")

    def test_convert_to_fp32(self):
        """`convert_to_fp32` must compile with `fullgraph=True` and run on a bf16 tensor."""
        bf16_zeros = torch.zeros([1], dtype=torch.bfloat16)
        torch.compile(convert_to_fp32, fullgraph=True)(bf16_zeros)

    def test_named_tuples(self):
        """`is_namedtuple` must accept `NamedTuple` classes, their subclasses and `namedtuple`s, nothing else."""

        class QuantTensorBase(NamedTuple):
            value: torch.Tensor
            scale: Optional[torch.Tensor]
            zero_point: Optional[torch.Tensor]

        class Second(QuantTensorBase):
            pass

        point = namedtuple("Point", ["x", "y"])

        named_tuples = (
            QuantTensorBase(torch.tensor(1.0), None, None),
            Second(torch.tensor(1.0), None, None),
            point(11, y=22),
        )
        for candidate in named_tuples:
            self.assertTrue(is_namedtuple(candidate))

        for candidate in ((1, 2), "hey", object()):
            self.assertFalse(is_namedtuple(candidate))

    def test_convert_dict_to_env_variables(self):
        """The entry with an invalid value is dropped with a warning; the others become `KEY=value` lines."""
        raw_env = {"ACCELERATE_DEBUG_MODE": "1", "BAD_ENV_NAME": "<mything", "OTHER_ENV": "2"}
        expected_lines = ["ACCELERATE_DEBUG_MODE=1\n", "OTHER_ENV=2\n"]
        with self.assertLogs("accelerate.utils.environment", level="WARNING"):
            converted = convert_dict_to_env_variables(raw_env)
        assert converted == expected_lines

    def test_has_offloaded_params(self):
        """Only an align-device hook attached with `offload=True` makes `has_offloaded_params` return True."""
        model = RegressionModel()
        # No hook at all
        assert not has_offloaded_params(model)

        # Align-device hook without offloading
        attach_align_device_hook(model, offload=False)
        assert not has_offloaded_params(model)

        # CPU offload hook
        remove_hook_from_module(model)
        model, _hook = cpu_offload_with_hook(model)
        assert not has_offloaded_params(model)

        # Align-device hook with offloading
        remove_hook_from_module(model)
        attach_align_device_hook(model, offload=True)
        assert has_offloaded_params(model)

    def test_concatenate(self):
        """Exercises `concatenate` on tensors, nested containers, non-tensor data and mismatched shapes."""

        def feature_pair():
            # Two tensors sharing the batch dimension but with different feature sizes
            return torch.randn(2, 3), torch.randn(2, 4)

        # Two plain tensors are stacked along the first dimension
        first = torch.randn(2, 3)
        second = torch.randn(2, 3)
        merged = concatenate([first, second])
        assert merged.shape == torch.Size([4, 3])
        assert torch.equal(merged[:2], first)
        assert torch.equal(merged[2:], second)

        # A single tensor comes back unchanged
        lone_tensor = torch.randn(3, 4)
        merged = concatenate([lone_tensor])
        assert merged.shape == torch.Size([3, 4])
        assert torch.equal(merged, lone_tensor)

        # NOTE: We return as-is if there's just a single batch of data, even if it's not a tensor
        lone_value = "test_string"
        assert concatenate([lone_value]) == lone_value

        # Lists of tensors are merged element-wise and stay lists
        merged = concatenate([list(feature_pair()), list(feature_pair())])
        assert isinstance(merged, list)
        assert len(merged) == 2
        assert merged[0].shape == torch.Size([4, 3])
        assert merged[1].shape == torch.Size([4, 4])

        # Same with tuples
        merged = concatenate([feature_pair(), feature_pair()])
        assert isinstance(merged, tuple)
        assert len(merged) == 2
        assert merged[0].shape == torch.Size([4, 3])
        assert merged[1].shape == torch.Size([4, 4])

        # Dicts are merged key by key
        dict_batches = [
            {"a": torch.randn(2, 3), "b": torch.randn(2, 4)},
            {"a": torch.randn(2, 3), "b": torch.randn(2, 4)},
        ]
        merged = concatenate(dict_batches)
        assert isinstance(merged, dict)
        assert "a" in merged
        assert "b" in merged
        assert merged["a"].shape == torch.Size([4, 3])
        assert merged["b"].shape == torch.Size([4, 4])

        # NOTE: We can't merge multiple batches of non-tensor data
        mixed_batches = [
            {"a": torch.randn(2, 3), "b": torch.randn(2, 4), "c": "test_string1"},
            {"a": torch.randn(2, 3), "b": torch.randn(2, 4), "c": "test_string2"},
        ]
        with self.assertRaises(TypeError):
            concatenate(mixed_batches)

        # Three batches keep their order
        batches = [torch.randn(5, 10), torch.randn(5, 10), torch.randn(5, 10)]
        merged = concatenate(batches)
        assert merged.shape == torch.Size([15, 10])
        assert torch.equal(merged[:5], batches[0])
        assert torch.equal(merged[5:10], batches[1])
        assert torch.equal(merged[10:], batches[2])

        # NOTE: We can't merge misaligned batches, the torch.cat will raise a RuntimeError
        narrow = torch.randn(5, 10)
        wide = torch.randn(5, 12)
        with self.assertRaises(RuntimeError):
            concatenate([narrow, wide])

        # Concatenation along another dimension
        merged = concatenate([torch.randn(3, 2, 4), torch.randn(3, 2, 4)], dim=1)
        assert merged.shape == torch.Size([3, 4, 4])

        # Nested containers: a dict holding a list of tensors and a tensor
        nested_batches = [
            {"inputs": [torch.randn(2, 3), torch.randn(2, 4)], "labels": torch.randn(2, 1)},
            {"inputs": [torch.randn(2, 3), torch.randn(2, 4)], "labels": torch.randn(2, 1)},
            {"inputs": [torch.randn(2, 3), torch.randn(2, 4)], "labels": torch.randn(2, 1)},
        ]
        merged = concatenate(nested_batches)
        assert isinstance(merged, dict)
        assert isinstance(merged["inputs"], list)
        assert merged["inputs"][0].shape == torch.Size([6, 3])
        assert merged["inputs"][1].shape == torch.Size([6, 4])
        assert merged["labels"].shape == torch.Size([6, 1])


def set_dummy_accelerate_env_var():
    """Set an accelerate env var and leave it behind.

    This helper emulates the behavior of, for instance, transformers.TrainingArguments, which is allowed to set
    accelerate env vars but does not clean them up. E.g.

    TrainingArguments(fp16=True, output_dir="/tmp/test")

    leaves ACCELERATE_MIXED_PRECISION=fp16 as an env var.
    """
    name, value = "ACCELERATE_SOME_ENV_VAR", "true"
    os.environ[name] = value


@purge_accelerate_environment
class MyUnittest(unittest.TestCase):
    """unittest class decorated directly: an env var set by one test must not be visible in another."""

    def test_purge_env_vars_unittest_1(self):
        leaked_var = "ACCELERATE_SOME_ENV_VAR"
        # Start from a clean slate, then set the variable without cleaning up
        os.environ.pop(leaked_var, None)
        set_dummy_accelerate_env_var()
        assert leaked_var in os.environ

    def test_purge_env_vars_unittest_2(self):
        leaked_var = "ACCELERATE_SOME_ENV_VAR"
        assert leaked_var not in os.environ


@unittest.skipIf(False, "dummy unittest wrapper")
@purge_accelerate_environment
@unittest.skipUnless(True, "dummy unittest wrapper")
class MyUnittestWithDecorators(unittest.TestCase):
    """Same check with `purge_accelerate_environment` sandwiched between other unittest decorators."""

    def test_purge_env_vars_unittest_with_wrapper_1(self):
        leaked_var = "ACCELERATE_SOME_ENV_VAR"
        # Start from a clean slate, then set the variable without cleaning up
        os.environ.pop(leaked_var, None)
        set_dummy_accelerate_env_var()
        assert leaked_var in os.environ

    def test_purge_env_vars_unittest_with_wrapper_2(self):
        leaked_var = "ACCELERATE_SOME_ENV_VAR"
        assert leaked_var not in os.environ

    @unittest.skipIf(False, "dummy unittest wrapper")
    def test_purge_env_vars_unittest_with_wrapper_3(self):
        # Same check on a method that carries a decorator of its own
        leaked_var = "ACCELERATE_SOME_ENV_VAR"
        assert leaked_var not in os.environ

    @unittest.skipIf(True, "this is always skipped")
    def test_purge_env_vars_unittest_with_wrapper_4(self):
        # ensure that unittest markers still do their job
        assert False


# Decorated unittest base class; MyUnittestWithInheritance subclasses it without repeating the decorator.
@purge_accelerate_environment
class _BaseCls(unittest.TestCase):
    def test_purge_env_vars_unittest_with_inheritance_3(self):
        # Asserts that the dummy variable is not set when this test runs.
        assert "ACCELERATE_SOME_ENV_VAR" not in os.environ


# Not decorated itself: only the parent class _BaseCls carries purge_accelerate_environment, so these tests check
# that the purge also covers test methods defined on a subclass.
class MyUnittestWithInheritance(_BaseCls):
    def test_purge_env_vars_unittest_with_inheritance_1(self):
        # Start from a known state, then set the dummy variable.
        os.environ.pop("ACCELERATE_SOME_ENV_VAR", None)
        set_dummy_accelerate_env_var()
        assert "ACCELERATE_SOME_ENV_VAR" in os.environ

    def test_purge_env_vars_unittest_with_inheritance_2(self):
        # The variable set by test 1 must not be visible here.
        assert "ACCELERATE_SOME_ENV_VAR" not in os.environ


# Same set-then-check pair as the unittest variants, but on a plain pytest-style class (no TestCase base).
@purge_accelerate_environment
class TestMyPytest:
    def test_purge_env_vars_pytest_1(self):
        # Start from a known state, then set the dummy variable.
        os.environ.pop("ACCELERATE_SOME_ENV_VAR", None)
        set_dummy_accelerate_env_var()
        assert "ACCELERATE_SOME_ENV_VAR" in os.environ

    def test_purge_env_vars_pytest_2(self):
        # The variable set by test 1 must not be visible here.
        assert "ACCELERATE_SOME_ENV_VAR" not in os.environ


@pytest.fixture
def dummy_fixture():
    """No-op fixture; it exists only so tests can attach ``pytest.mark.usefixtures("dummy_fixture")``."""
    pass


# purge_accelerate_environment is placed between no-op pytest marks (skipif / usefixtures) to check that it composes
# with marks applied both before and after it.
@pytest.mark.skipif(False, reason="dummy pytest wrapper")
@pytest.mark.usefixtures("dummy_fixture")
@purge_accelerate_environment
@pytest.mark.skipif(False, reason="dummy pytest wrapper")
@pytest.mark.usefixtures("dummy_fixture")
class TestPytestWithWrapper:
    def test_purge_env_vars_pytest_with_wrapper_1(self):
        # Start from a known state, then set the dummy variable.
        os.environ.pop("ACCELERATE_SOME_ENV_VAR", None)
        set_dummy_accelerate_env_var()
        assert "ACCELERATE_SOME_ENV_VAR" in os.environ

    def test_purge_env_vars_pytest_with_wrapper_2(self):
        # The variable set by test 1 must not be visible here.
        assert "ACCELERATE_SOME_ENV_VAR" not in os.environ

    @pytest.mark.skipif(False, reason="dummy pytest wrapper")
    @pytest.mark.usefixtures("dummy_fixture")
    def test_purge_env_vars_pytest_with_wrapper_3(self):
        # Same check as test 2, but on a method that carries its own (no-op) marks.
        assert "ACCELERATE_SOME_ENV_VAR" not in os.environ

    @pytest.mark.skipif(True, reason="this is always skipped")
    def test_purge_env_vars_pytest_with_wrapper_4_should_be_skipped(self):
        # ensure that pytest markers still do their job
        # (if the skip marker were ignored, this test would run and fail)
        assert False


# Decorated pytest-style base class; TestPytestWithInheritance subclasses it without repeating the decorator.
@purge_accelerate_environment
class _PytestBaseCls:
    def test_purge_env_vars_pytest_with_inheritance_3(self):
        # Asserts that the dummy variable is not set when this test runs.
        assert "ACCELERATE_SOME_ENV_VAR" not in os.environ


# Not decorated itself: only the parent class _PytestBaseCls carries purge_accelerate_environment, so these tests
# check that the purge also covers test methods defined on a subclass.
class TestPytestWithInheritance(_PytestBaseCls):
    def test_purge_env_vars_pytest_with_inheritance_1(self):
        # Start from a known state, then set the dummy variable.
        os.environ.pop("ACCELERATE_SOME_ENV_VAR", None)
        set_dummy_accelerate_env_var()
        assert "ACCELERATE_SOME_ENV_VAR" in os.environ

    def test_purge_env_vars_pytest_with_inheritance_2(self):
        # The variable set by test 1 must not be visible here.
        assert "ACCELERATE_SOME_ENV_VAR" not in os.environ


@purge_accelerate_environment
def test_purge_env_vars_standalone_1():
    # Decorated module-level test: sets the dummy variable and checks it is visible inside the test body.
    # test_purge_env_vars_standalone_2 then checks that it is no longer set.
    os.environ.pop("ACCELERATE_SOME_ENV_VAR", None)
    set_dummy_accelerate_env_var()
    assert "ACCELERATE_SOME_ENV_VAR" in os.environ


def test_purge_env_vars_standalone_2():
    # Undecorated on purpose: it only observes the environment and expects that the variable set by
    # test_purge_env_vars_standalone_1 is no longer present (presumably relies on that test running first).
    assert "ACCELERATE_SOME_ENV_VAR" not in os.environ


def test_purge_env_vars_restores_previous_values():
    """Check that purge_accelerate_environment restores pre-existing values and keeps untouched variables.

    Two accelerate env vars are set before calling a decorated function that overwrites one of them. After the call,
    the overwritten variable must hold its previous value again and the untouched one must still be present.
    """

    @purge_accelerate_environment
    def dummy_func():
        os.environ["ACCELERATE_SOME_ENV_VAR"] = "456"

    os.environ["ACCELERATE_SOME_ENV_VAR"] = "1"
    os.environ["ACCELERATE_ANOTHER_ENV_VAR"] = "2"

    try:
        dummy_func()

        assert os.environ["ACCELERATE_SOME_ENV_VAR"] == "1"
        assert os.environ["ACCELERATE_ANOTHER_ENV_VAR"] == "2"
    finally:
        # Always clean up, even when an assertion above fails, so the variables cannot leak into other tests.
        # pop() with a default is used so a missing key does not raise and mask the original failure.
        os.environ.pop("ACCELERATE_SOME_ENV_VAR", None)
        os.environ.pop("ACCELERATE_ANOTHER_ENV_VAR", None)


================================================
FILE: tests/tp/fsdp2_tp_preparation.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from datetime import timedelta

import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer

from accelerate import Accelerator, InitProcessGroupKwargs
from accelerate.parallelism_config import ParallelismConfig
from accelerate.utils import FullyShardedDataParallelPlugin


class LmHeadWrapper(torch.nn.Module):
    """Pass-through module that holds an LM head and forwards its input to it unchanged."""

    def __init__(self, lm_head):
        """Keep ``lm_head`` on the instance under the attribute name ``lm_head``."""
        super().__init__()
        self.lm_head = lm_head

    def forward(self, x):
        """Return the result of calling the wrapped head on ``x``."""
        head = self.lm_head
        return head(x)


def build_simple_dataloader(tokenizer, seq_len=64, batch_size=2):
    """Create a small, unshuffled DataLoader over a slice of wikitext-2 for the reproduction script.

    Args:
        tokenizer: Callable tokenizer; it is called on text and must expose `pad_token_id` / `eos_token_id`.
        seq_len: Passed as `max_length` when tokenizing with truncation.
        batch_size: Batch size handed to the DataLoader.

    Returns:
        A `DataLoader` whose batches are dicts with padded `input_ids` and `labels` tensors.
    """
    # Take the first 1% of the train split and drop rows that tokenize to nothing.
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
    dataset = dataset.filter(lambda row: len(tokenizer(row["text"])["input_ids"]) > 0)
    # Cap the dataset at 100 samples.
    sample_count = min(100, len(dataset))
    dataset = dataset.select(range(sample_count))

    def _tokenize(rows):
        return tokenizer(rows["text"], truncation=True, max_length=seq_len)

    tokenized = dataset.map(_tokenize, batched=True, remove_columns=["text"])
    tokenized.set_format(type="torch", columns=["input_ids"])

    def _collate(samples):
        input_ids = [sample["input_ids"] for sample in samples]
        label_ids = [ids.clone() for ids in input_ids]
        # Fall back to the EOS id when the tokenizer defines no pad token.
        if tokenizer.pad_token_id is not None:
            pad_value = tokenizer.pad_token_id
        else:
            pad_value = tokenizer.eos_token_id
        padded_inputs = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=pad_value)
        # Labels are padded with -100 rather than the pad token id.
        padded_labels = torch.nn.utils.rnn.pad_sequence(label_ids, batch_first=True, padding_value=-100)
        return {"input_ids": padded_inputs, "labels": padded_labels}

    return DataLoader(tokenized, batch_size=batch_size, shuffle=False, collate_fn=_collate)


def main():
    """Reproduce FSDP2 + tensor-parallel preparation: build the Accelerator, load the model, call `prepare`.

    The script is sized for 4 processes: TP is 2 and the data-parallel shard size is derived as 4 // TP.
    """
    # Fixed settings for the reproduction.
    model_name = "Qwen/Qwen3-0.6B"
    batch_size = 2
    seq_len = 64
    tp_size = 2
    dp_size = 4 // tp_size

    # Accelerator configured for FSDP2 with an explicit parallelism layout.
    process_group_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=1800))
    parallelism = ParallelismConfig(dp_shard_size=dp_size, tp_size=tp_size)

    plugin = FullyShardedDataParallelPlugin(
        fsdp_version=2,
        reshard_after_forward=True,
        auto_wrap_policy="transformer_based_wrap",
        state_dict_type="SHARDED_STATE_DICT",
        activation_checkpointing=False,
        cpu_ram_efficient_loading=True,
    )

    accelerator = Accelerator(
        kwargs_handlers=[process_group_kwargs],
        parallelism_config=parallelism,
        fsdp_plugin=plugin,
    )

    rank = accelerator.process_index
    print(f"[Rank {rank}] Initializing...")

    # Tensor-parallel loading arguments are only passed when TP is actually enabled.
    if tp_size > 1:
        load_kwargs = {"tp_size": tp_size, "tp_plan": "auto", "device_mesh": accelerator.torch_device_mesh}
    else:
        load_kwargs = {}

    model = AutoModelForCausalLM.from_pretrained(model_name, use_cache=False, **load_kwargs)

    # Replace the LM head with a wrapper module that simply forwards to it.
    model.lm_head = LmHeadWrapper(model.lm_head)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    print(f"[Rank {rank}] Building dataloader...")
    dataloader = build_simple_dataloader(tokenizer, seq_len=seq_len, batch_size=batch_size)

    print(f"[Rank {rank}] Preparing with accelerator...")
    # This prepare() call is where the error this script reproduces was originally reported.
    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

    print(f"[Rank {rank}] Preparation successful!")


if __name__ == "__main__":
    main()


================================================
FILE: tests/tp/fsdp2_tp_preparation_config.yaml
================================================
# FSDP2 Single Node Configuration
# Status: CURRENT - Recommended for new single-node usage

compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 4  # fsdp2_tp_preparation.py assumes 4 processes (TP=2, DP=4 // TP); change both together
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

================================================
FILE: tests/tp/test_tp.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os

from accelerate.test_utils.testing import (
    TempDirTestCase,
    execute_subprocess_async,
    get_launch_command,
    path_in_accelerate_package,
    require_multi_device,
    require_non_torch_xla,
    require_tp,
    require_transformers,
    run_first,
    slow,
)
from accelerate.utils import patch_environment


@require_non_torch_xla
@require_multi_device
@require_transformers
@require_tp
@run_first
@slow
class TPIntegrationTest(TempDirTestCase):
    """Launch tensor-parallel scripts in a subprocess through the accelerate launcher."""

    test_scripts_folder = path_in_accelerate_package("test_utils", "scripts", "external_deps")

    def setUp(self):
        """Set the TP size, model name and batch size used by the tests, then seed with 42."""
        from accelerate.utils import set_seed

        super().setUp()
        self.test_tp_size = 2
        self.model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        self.batch_size = 1

        set_seed(42)

    def test_working_of_tp(self):
        """Run `test_performance.py` with an automatic TP plan on `test_tp_size` processes."""
        self.test_file_path = self.test_scripts_folder / "test_performance.py"
        launch_cmd = get_launch_command(num_processes=self.test_tp_size, num_machines=1, machine_rank=0)
        script_and_args = [
            self.test_file_path,
            f"--output_dir={self.tmpdir}",
            f"--model_name_or_path={self.model_name_or_path}",
            "--add_pad_token=true",
            "--tp_plan=auto",
            f"--tp_size={self.test_tp_size}",
        ]
        launch_cmd.extend(script_and_args)
        with patch_environment(omp_num_threads=1):
            execute_subprocess_async(launch_cmd)

    def test_working_of_tp_and_fsdp(self):
        """Run the FSDP2 + TP preparation script that sits next to this file, using its YAML config."""
        here = os.path.dirname(os.path.abspath(__file__))
        self.test_file_path = os.path.join(here, "fsdp2_tp_preparation.py")
        self.test_config_path = os.path.join(here, "fsdp2_tp_preparation_config.yaml")
        launch_cmd = get_launch_command()
        # The config file option goes before the script path.
        launch_cmd.extend([f"--config_file={self.test_config_path}", self.test_file_path])
        with patch_environment(omp_num_threads=4):
            execute_subprocess_async(launch_cmd)


================================================
FILE: tests/xla_spawn.py
================================================
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
A simple launcher script for TPU training

Inspired by https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py

::
    >>> python xla_spawn.py --num_cores=NUM_CORES_YOU_HAVE
               YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
               arguments of your training script)

"""

import importlib
import sys
from argparse import REMAINDER, ArgumentParser
from pathlib import Path

import torch_xla.distributed.xla_multiprocessing as xmp
from torch_xla import device_count


def parse_args():
    """
    Parse the launcher's command line options.

    Returns:
        `argparse.Namespace`: the parsed arguments, with `num_cores`, `training_script` and
        `training_script_args` attributes.
    """
    cli = ArgumentParser(
        description=(
            "PyTorch TPU distributed training launch helper utility that will spawn up multiple distributed processes"
        )
    )

    # Optional launcher argument; the default is the device count reported by torch_xla.
    cli.add_argument(
        "--num_cores",
        type=int,
        default=device_count(),
        help="Number of TPU cores to use (1 or number of available devices).",
    )

    # Positional: the script to launch.
    script_help = (
        "The full path to the single TPU training "
        "program/script to be launched in parallel, "
        "followed by all the arguments for the "
        "training script"
    )
    cli.add_argument("training_script", type=str, help=script_help)

    # Everything after the script path is collected untouched for the training script.
    cli.add_argument("training_script_args", nargs=REMAINDER)

    return cli.parse_args()


def main():
    """Import the training script as a module and spawn its `_mp_fn` with `xmp.spawn`.

    The directory containing the training script is appended to `sys.path` so the script can be imported by its
    stem, and `sys.argv` is rewritten so the training script sees only its own arguments.
    """
    args = parse_args()

    # Import training_script as a module.
    script_fpath = Path(args.training_script)
    sys.path.append(str(script_fpath.parent.resolve()))
    mod_name = script_fpath.stem
    mod = importlib.import_module(mod_name)

    # Patch sys.argv
    sys.argv = [args.training_script] + args.training_script_args
    num_cores = args.num_cores
    if num_cores == device_count() and num_cores != 1:
        # There is an error in xmp.spawn that causes it to fail when num_cores is specified and not 1, so we set it to
        # None when it matches the number of devices.
        num_cores = None
    # Pass the adjusted value, not args.num_cores, otherwise the workaround above has no effect.
    xmp.spawn(mod._mp_fn, args=(), nprocs=num_cores)


if __name__ == "__main__":
    main()


================================================
FILE: utils/log_reports.py
================================================
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from datetime import date
from pathlib import Path

from tabulate import DataRow, TableFormat, tabulate


# Table layout shared by every table in the report: pipe-delimited rows with no border or separator lines.
hf_table_format = TableFormat(
    lineabove=None,
    linebelowheader=None,
    linebetweenrows=None,
    linebelow=None,
    headerrow=DataRow("", "|", "|"),
    datarow=DataRow("", "|", "|"),
    padding=1,
    with_header_hide=None,
)


# Failures collected for the log file currently being parsed; rebound to a fresh list after each file.
failed = []
# One [log path, failure count, failed-test rows] entry per parsed log file.
group_info = []

# Block appended to the payload when there are no failures.
no_error_payload = {"type": "section", "text": {"type": "plain_text", "text": "No failed tests! 🤗", "emoji": True}}

# Blocks for the main message; starts with a header that includes the TEST_TYPE environment variable (empty if unset).
payload = [
    {
        "type": "header",
        "text": {
            "type": "plain_text",
            "text": f"🤗 Accelerate nightly {os.environ.get('TEST_TYPE', '')} test results",
            "emoji": True,
        },
    }
]

# Parse every *.log file in the current working directory. Each line is decoded as one JSON record; a record counts
# as a failure when it has a non-empty "nodeid", a "duration" and an "outcome" of "failed".
total_num_failed = 0
for log in Path().glob("*.log"):
    section_num_failed = 0
    with open(log) as f:
        for line in f:
            line = json.loads(line)
            if line.get("nodeid", "") != "":
                test = line["nodeid"]
                if line.get("duration", None) is not None:
                    duration = f"{line['duration']:.4f}"
                    if line.get("outcome", "") == "failed":
                        section_num_failed += 1
                        # Row layout: [nodeid, formatted duration, part of the log file name before the first "_"].
                        failed.append([test, duration, log.name.split("_")[0]])
                        total_num_failed += 1
    group_info.append([str(log), section_num_failed, failed])
    failed = []
    # The log file is deleted once it has been parsed.
    log.unlink()

# Build the summary text: for each log file with failures, a "*<log>: N failed test(s)*" heading followed by a table
# of failure counts per test file.
message = ""
# One {test file name: [remaining nodeid parts, ...]} dict per log file with failures; used for the threaded Slack
# replies at the end of the script.
all_files2failed = []
if total_num_failed > 0:
    for name, num_failed, failed_tests in group_info:
        if num_failed > 0:
            # NOTE(review): name[1:] drops the first character of the log path — confirm this is intended.
            if num_failed == 1:
                message += f"*{name[1:]}: {num_failed} failed test*\n"
            else:
                message += f"*{name[1:]}: {num_failed} failed tests*\n"
            failed_table = []
            files2failed = {}
            for test in failed_tests:
                # Split the nodeid on "::" and reduce its first part (the file path) to the base file name.
                data = test[0].split("::")
                data[0] = data[0].split("/")[-1]
                if data[0] not in files2failed:
                    files2failed[data[0]] = [data[1:]]
                else:
                    files2failed[data[0]] += [data[1:]]
                failed_table.append(data)

            files = [test[0] for test in failed_table]
            individual_files = list(set(files))
            # Count number of instances in failed_tests
            table = []
            for file in individual_files:
                table.append([file, len(files2failed[file])])

            # failed_table is rebound here from the list of rows to the rendered table string.
            failed_table = tabulate(
                table,
                headers=["Test Location", "Num Failed"],
                tablefmt=hf_table_format,
                stralign="right",
            )
            message += f"\n```\n{failed_table}\n```"
            all_files2failed.append(files2failed)
    # Truncate long reports so the final text stays under 3000 characters (presumably a Slack text limit — confirm),
    # closing the code fence and pointing readers to the Action results instead.
    if len(message) > 3000:
        err = "Too many failed tests, please see the full report in the Action results."
        offset = len(err) + 10
        message = message[: 3000 - offset] + f"\n...\n```\n{err}"
    print(f"### {message}")
else:
    message = "No failed tests! 🤗"
    print(f"## {message}")
    payload.append(no_error_payload)

# Post to Slack only when TEST_TYPE is set to a non-empty value; otherwise the script just prints the summary above.
if os.environ.get("TEST_TYPE", "") != "":
    # Imported here so slack_sdk is only needed when a report is actually posted.
    from slack_sdk import WebClient

    client = WebClient(token=os.environ["SLACK_API_TOKEN"])
    if message != "No failed tests! 🤗":
        # There were failures: add the summary, a button linking to the GitHub Actions run, and a dated footer.
        md_report = {
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": message,
            },
        }
        payload.append(md_report)
        action_button = {
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": "*For more details:*",
            },
            "accessory": {
                "type": "button",
                "text": {
                    "type": "plain_text",
                    "text": "Check Action results",
                    "emoji": True,
                },
                "url": f"https://github.com/{os.environ['GITHUB_REPOSITORY']}/actions/runs/{os.environ['GITHUB_RUN_ID']}",
            },
        }
        payload.append(action_button)
        date_report = {
            "type": "context",
            "elements": [
                {
                    "type": "plain_text",
                    "text": f"Nightly {os.environ.get('TEST_TYPE')} test results for {date.today()}",
                }
            ],
        }
        payload.append(date_report)
    response = client.chat_postMessage(channel="#accelerate-ci-daily", text=message, blocks=payload)
    # Timestamp of the main message; the per-file details below are posted as replies in its thread.
    ts = response.data["ts"]
    for failed_file in all_files2failed:
        for test_location, test_failures in failed_file.items():
            # Keep only the first instance of the test name
            # (consecutive rows that repeat the previous row's first column get that column blanked)
            test_class = ""
            for i, row in enumerate(test_failures):
                if row[0] != test_class:
                    test_class = row[0]
                else:
                    test_failures[i][0] = ""

            # payload is rebound here to a single section block holding this file's failure table.
            payload = {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"Test location: {test_location}\n```\n{tabulate(test_failures, headers=['Class', 'Test'], tablefmt=hf_table_format, stralign='right')}\n```",
                },
            }

            client.chat_postMessage(
                channel="#accelerate-ci-daily",
                thread_ts=ts,
                blocks=[payload],
            )


================================================
FILE: utils/stale.py
================================================
# Copyright 2022 The HuggingFace Team, the AllenNLP library authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Script to close stale issue. Taken in part from the AllenNLP repository.
https://github.com/allenai/allennlp.
"""

import os
from datetime import datetime as dt
from datetime import timezone

from github import Github


# Issues carrying any of these labels (matched against the lower-cased label name) are neither closed nor marked
# stale by main().
LABELS_TO_EXEMPT = [
    "good first issue",
    "feature request",
    "wip",
]


def main():
    """Close or flag stale open issues of huggingface/accelerate.

    For every open issue that is at least 30 days old and carries no exempt label:

    - it is closed when the most recent comment is from `github-actions[bot]` and the issue has not been updated for
      more than 7 days;
    - otherwise a stale notice is posted when the issue has not been updated for more than 23 days.

    Authentication uses the `GITHUB_TOKEN` environment variable.
    """
    stale_notice = (
        "This issue has been automatically marked as stale because it has not had "
        "recent activity. If you think this still needs to be addressed "
        "please comment on this thread.\n\nPlease note that issues that do not follow the "
        "[contributing guidelines](https://github.com/huggingface/accelerate/blob/main/CONTRIBUTING.md) "
        "are likely to be ignored."
    )

    client = Github(os.environ["GITHUB_TOKEN"])
    repository = client.get_repo("huggingface/accelerate")

    for issue in repository.get_issues(state="open"):
        newest_first = sorted(issue.get_comments(), key=lambda comment: comment.created_at, reverse=True)
        latest_comment = newest_first[0] if newest_first else None
        now = dt.now(timezone.utc)
        idle_days = (now - issue.updated_at).days
        age_days = (now - issue.created_at).days

        def has_exempt_label():
            # Evaluated lazily so labels are only fetched once the cheaper checks have passed.
            return any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())

        bot_spoke_last = latest_comment is not None and latest_comment.user.login == "github-actions[bot]"

        if bot_spoke_last and idle_days > 7 and age_days >= 30 and not has_exempt_label():
            # More than 7 days without activity since the bot's notice: close the issue.
            issue.edit(state="closed")
        elif idle_days > 23 and age_days >= 30 and not has_exempt_label():
            # Post the stale notice.
            issue.create_comment(stale_notice)


if __name__ == "__main__":
    main()