gitextract_lov69rop/

├── .gitignore
├── LICENSE-CC-BY-SA
├── Makefile
├── README.md
├── build/
│   ├── README.md
│   ├── linkcheckerrc
│   ├── mdbook/
│   │   ├── md-to-html.py
│   │   ├── mv-links.py
│   │   ├── preprocess-html-for-epub.py
│   │   └── utils/
│   │       ├── build_utils.py
│   │       └── github_md_utils.py
│   ├── prince_style.css
│   └── requirements.txt
├── chapters-md.txt
├── compute/
│   ├── README.md
│   ├── accelerator/
│   │   ├── README.md
│   │   ├── amd/
│   │   │   ├── debug.md
│   │   │   └── performance.md
│   │   ├── benchmarks/
│   │   │   ├── README.md
│   │   │   └── mamf-finder.py
│   │   └── nvidia/
│   │       └── debug.md
│   ├── cpu/
│   │   └── README.md
│   └── cpu-memory/
│       └── README.md
├── contributors.md
├── debug/
│   ├── NicerTrace.py
│   ├── README.md
│   ├── make-tiny-models-tokenizers-datasets.md
│   ├── nccl-performance-debug.md
│   ├── pytorch.md
│   ├── tiny-scripts/
│   │   ├── README.md
│   │   ├── c4-en-10k.py
│   │   ├── cm4-synthetic-testing.py
│   │   ├── fsmt-make-super-tiny-model.py
│   │   ├── general-pmd-ds-unpack.py
│   │   ├── general-pmd-synthetic-testing.py
│   │   ├── idefics-make-tiny-model.py
│   │   ├── m4-ds-unpack.py
│   │   ├── mt5-make-tiny-model.py
│   │   ├── openwebtext-10k.py
│   │   └── oscar-en-10k.py
│   ├── tools.md
│   ├── torch-distributed-gpu-test.py
│   ├── torch-distributed-hanging-solutions.md
│   ├── underflow_overflow.md
│   └── underflow_overflow.py
├── inference/
│   └── README.md
├── insights/
│   ├── ai-battlefield.md
│   └── how-to-choose-cloud-provider.md
├── model-parallelism/
│   └── README.md
├── network/
│   ├── README.md
│   ├── benchmarks/
│   │   ├── README.md
│   │   ├── all_gather_object_vs_all_gather.py
│   │   ├── all_gather_object_vs_all_reduce.py
│   │   ├── all_reduce_bench.py
│   │   ├── all_reduce_bench_pyxis.sbatch
│   │   ├── all_reduce_latency_comp.py
│   │   └── results/
│   │       ├── README.md
│   │       └── disable-nvlink.md
│   ├── comms.md
│   └── debug/
│       └── README.md
├── orchestration/
│   ├── README.md
│   ├── kubernetes/
│   │   └── README.md
│   └── slurm/
│       ├── README.md
│       ├── admin.md
│       ├── cron-daily.slurm
│       ├── cron-hourly.slurm
│       ├── example.slurm
│       ├── launchers/
│       │   ├── README.md
│       │   ├── accelerate-launcher.slurm
│       │   ├── lightning-launcher.slurm
│       │   ├── srun-launcher.slurm
│       │   └── torchrun-launcher.slurm
│       ├── performance.md
│       ├── undrain-good-nodes.sh
│       └── users.md
├── resources/
│   └── README.md
├── stabs/
│   ├── README.md
│   └── incoming.md
├── storage/
│   ├── README.md
│   ├── benchmarks/
│   │   └── results/
│   │       └── hope-2023-12-20-14-37-02-331702-summary.md
│   ├── fio-json-extract.py
│   └── fio-scan
├── testing/
│   ├── README.md
│   └── testing_utils.py
├── todo.md
└── training/
    ├── README.md
    ├── checkpoints/
    │   ├── README.md
    │   ├── torch-checkpoint-convert-to-bf16
    │   └── torch-checkpoint-shrink.py
    ├── datasets.md
    ├── dtype.md
    ├── emulate-multi-node.md
    ├── fault-tolerance/
    │   ├── README.md
    │   ├── fs-watchdog.py
    │   ├── fs-watchdog.slurm
    │   ├── slurm-status.py
    │   └── slurm-status.slurm
    ├── hparams.md
    ├── instabilities/
    │   ├── README.md
    │   └── training-loss-patterns.md
    ├── model-parallelism/
    │   └── README.md
    ├── performance/
    │   ├── README.md
    │   ├── benchmarks/
    │   │   ├── activation-memory-per-layer.py
    │   │   ├── dataloader/
    │   │   │   ├── num-workers-bench.py
    │   │   │   └── pin-memory-non-block-bench.py
    │   │   ├── matrix-shape/
    │   │   │   └── swiglu-maf-bench.py
    │   │   └── numa/
    │   │       ├── numa-set-pynvml.py
    │   │       └── numa-set.sh
    │   └── distributed/
    │       └── torch-dist-mem-usage.py
    ├── re-train-hub-models.md
    ├── reproducibility/
    │   └── README.md
    └── tools/
        ├── main_process_first.py
        ├── multi-gpu-non-interleaved-print.py
        └── printflock.py