gitextract_pp0afoji/

├── .github/
│   └── workflows/
│       ├── ci.yml
│       ├── ci_gpu.yml
│       └── ci_tests.yml
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── dev/
│   ├── cpu/
│   │   └── matmul_forward.c
│   ├── cuda/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── adamw.cu
│   │   ├── attention_backward.cu
│   │   ├── attention_forward.cu
│   │   ├── benchmark_on_modal.py
│   │   ├── classifier_fused.cu
│   │   ├── common.h
│   │   ├── crossentropy_forward.cu
│   │   ├── crossentropy_softmax_backward.cu
│   │   ├── encoder_backward.cu
│   │   ├── encoder_forward.cu
│   │   ├── fused_residual_forward.cu
│   │   ├── gelu_backward.cu
│   │   ├── gelu_forward.cu
│   │   ├── global_norm.cu
│   │   ├── layernorm_backward.cu
│   │   ├── layernorm_forward.cu
│   │   ├── matmul_backward.cu
│   │   ├── matmul_backward_bias.cu
│   │   ├── matmul_forward.cu
│   │   ├── nccl_all_reduce.cu
│   │   ├── permute.cu
│   │   ├── residual_forward.cu
│   │   ├── softmax_forward.cu
│   │   └── trimat_forward.cu
│   ├── data/
│   │   ├── README.md
│   │   ├── data_common.py
│   │   ├── edu_fineweb.sh
│   │   ├── fineweb.py
│   │   ├── fineweb.sh
│   │   ├── hellaswag.py
│   │   ├── mmlu.py
│   │   ├── tinyshakespeare.py
│   │   └── tinystories.py
│   ├── download_starter_pack.sh
│   ├── eval/
│   │   ├── README.md
│   │   ├── export_hf.py
│   │   ├── run_eval.sh
│   │   └── summarize_eval.py
│   ├── loss_checker_ci.py
│   ├── test/
│   │   ├── Makefile
│   │   ├── device_file_io.cu
│   │   ├── test_dataloader.c
│   │   └── test_outlier_detector.c
│   ├── unistd.h
│   └── vislog.ipynb
├── doc/
│   └── layernorm/
│       ├── layernorm.c
│       ├── layernorm.md
│       └── layernorm.py
├── llmc/
│   ├── adamw.cuh
│   ├── attention.cuh
│   ├── cublas_common.h
│   ├── cuda_common.h
│   ├── cuda_utils.cuh
│   ├── cudnn_att.cpp
│   ├── cudnn_att.h
│   ├── dataloader.h
│   ├── encoder.cuh
│   ├── fused_classifier.cuh
│   ├── gelu.cuh
│   ├── global_norm.cuh
│   ├── layernorm.cuh
│   ├── logger.h
│   ├── matmul.cuh
│   ├── mfu.h
│   ├── outlier_detector.h
│   ├── rand.h
│   ├── sampler.h
│   ├── schedulers.h
│   ├── tokenizer.h
│   ├── utils.h
│   └── zero.cuh
├── profile_gpt2.cu
├── profile_gpt2cu.py
├── requirements.txt
├── scripts/
│   ├── README.md
│   ├── multi_node/
│   │   ├── run_gpt2_124M_fs.sbatch
│   │   ├── run_gpt2_124M_mpi.sh
│   │   └── run_gpt2_124M_tcp.sbatch
│   ├── pyrun_gpt2_124M.sh
│   ├── run_gpt2_124M.sh
│   ├── run_gpt2_1558M.sh
│   ├── run_gpt2_350M.sh
│   ├── run_gpt2_774M.sh
│   └── run_gpt3_125M.sh
├── test_gpt2.c
├── test_gpt2.cu
├── test_gpt2_fp32.cu
├── train_gpt2.c
├── train_gpt2.cu
├── train_gpt2.py
├── train_gpt2_fp32.cu
└── train_llama3.py