Full Code of FMInference/FlexLLMGen for AI

main 004ffef82b46 cached

3433 files

48.1 MB

12.8M tokens

42052 symbols

1 requests

Copy disabled (too large) Download .txt

Showing preview only (51,266K chars total). Download the full file to get everything.

Repository: FMInference/FlexLLMGen
Branch: main
Commit: 004ffef82b46
Files: 3433
Total size: 48.1 MB

Directory structure:
gitextract_3q2i3t76/

├── .gitignore
├── LICENSE
├── README.md
├── benchmark/
│   ├── batch_size_table.md
│   ├── flexgen/
│   │   └── bench_scan_175b.sh
│   ├── flexllmgen/
│   │   ├── README.md
│   │   ├── bench_175b_1x4.sh
│   │   ├── bench_175b_4x1.sh
│   │   ├── bench_30b_1x4.sh
│   │   ├── bench_30b_4x1.sh
│   │   ├── bench_6.7b_1x4.sh
│   │   ├── bench_6.7b_4x1.sh
│   │   ├── bench_dist_multi_node.sh
│   │   ├── bench_dist_single_node.sh
│   │   └── bench_suite.py
│   ├── hf_ds/
│   │   ├── README.md
│   │   ├── bench_all_1x4.sh
│   │   ├── bench_ds_175b_4x1.sh
│   │   ├── bench_ds_30b_1x4.sh
│   │   ├── bench_ds_30b_4x1.sh
│   │   ├── bench_ds_6.7b_1x4.sh
│   │   ├── bench_ds_6.7b_2x1.sh
│   │   ├── bench_ds_6.7b_4x1.sh
│   │   ├── bench_hf.py
│   │   ├── hf_opt.py
│   │   └── hostfile
│   ├── petals/
│   │   ├── README.md
│   │   └── run_opt_requests.py
│   └── third_party/
│       ├── DeepSpeed/
│       │   ├── .clang-format
│       │   ├── .github/
│       │   │   ├── ISSUE_TEMPLATE/
│       │   │   │   ├── compression_bug_report.md
│       │   │   │   ├── feature_request.md
│       │   │   │   ├── inference_bug_report.md
│       │   │   │   └── training_bug_report.md
│       │   │   └── workflows/
│       │   │       ├── amd.yml
│       │   │       ├── formatting.yml
│       │   │       ├── nv-accelerate-v100.yml
│       │   │       ├── nv-inference.yml
│       │   │       ├── nv-lightning-v100.yml
│       │   │       ├── nv-mii.yml
│       │   │       ├── nv-nightly.yml
│       │   │       ├── nv-torch-latest-v100.yml
│       │   │       ├── nv-torch-nightly-v100.yml
│       │   │       ├── nv-torch18-p40.yml
│       │   │       ├── nv-torch18-v100.yml
│       │   │       ├── nv-transformers-v100.yml
│       │   │       ├── pre-compile-ops.yml
│       │   │       └── python.yml
│       │   ├── .gitignore
│       │   ├── .pre-commit-config.yaml
│       │   ├── .pylintrc
│       │   ├── .readthedocs.yml
│       │   ├── .style.yapf
│       │   ├── CODEOWNERS
│       │   ├── CODE_OF_CONDUCT.md
│       │   ├── CONTRIBUTING.md
│       │   ├── LICENSE
│       │   ├── MANIFEST.in
│       │   ├── MANIFEST_win.in
│       │   ├── README.md
│       │   ├── SECURITY.md
│       │   ├── azure/
│       │   │   └── README.md
│       │   ├── benchmarks/
│       │   │   ├── __init__.py
│       │   │   ├── communication/
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── all_gather.py
│       │   │   │   ├── all_reduce.py
│       │   │   │   ├── all_to_all.py
│       │   │   │   ├── broadcast.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── pt2pt.py
│       │   │   │   ├── run_all.py
│       │   │   │   └── utils.py
│       │   │   └── inference/
│       │   │       ├── bert-bench.py
│       │   │       ├── collect_results.py
│       │   │       ├── gpt-bench.py
│       │   │       ├── requirements.txt
│       │   │       ├── run_model.sh
│       │   │       └── sweep.sh
│       │   ├── bin/
│       │   │   ├── ds
│       │   │   ├── ds_bench
│       │   │   ├── ds_elastic
│       │   │   ├── ds_report
│       │   │   └── ds_ssh
│       │   ├── build_win.bat
│       │   ├── csrc/
│       │   │   ├── adagrad/
│       │   │   │   └── cpu_adagrad.cpp
│       │   │   ├── adam/
│       │   │   │   ├── cpu_adam.cpp
│       │   │   │   ├── fused_adam_frontend.cpp
│       │   │   │   ├── multi_tensor_adam.cu
│       │   │   │   └── multi_tensor_apply.cuh
│       │   │   ├── aio/
│       │   │   │   ├── common/
│       │   │   │   │   ├── deepspeed_aio_common.cpp
│       │   │   │   │   ├── deepspeed_aio_common.h
│       │   │   │   │   ├── deepspeed_aio_types.cpp
│       │   │   │   │   ├── deepspeed_aio_types.h
│       │   │   │   │   ├── deepspeed_aio_utils.cpp
│       │   │   │   │   └── deepspeed_aio_utils.h
│       │   │   │   ├── py_lib/
│       │   │   │   │   ├── deepspeed_aio_thread.cpp
│       │   │   │   │   ├── deepspeed_aio_thread.h
│       │   │   │   │   ├── deepspeed_py_aio.cpp
│       │   │   │   │   ├── deepspeed_py_aio.h
│       │   │   │   │   ├── deepspeed_py_aio_handle.cpp
│       │   │   │   │   ├── deepspeed_py_aio_handle.h
│       │   │   │   │   ├── deepspeed_py_copy.cpp
│       │   │   │   │   ├── deepspeed_py_copy.h
│       │   │   │   │   └── py_ds_aio.cpp
│       │   │   │   └── py_test/
│       │   │   │       ├── aio_bench_generate_param.py
│       │   │   │       ├── aio_bench_perf_sweep.py
│       │   │   │       ├── ds_aio_basic.py
│       │   │   │       ├── ds_aio_handle.py
│       │   │   │       ├── parse_aio_stats.py
│       │   │   │       ├── perf_sweep_utils.py
│       │   │   │       ├── run_read_sweep.sh
│       │   │   │       ├── run_write_sweep.sh
│       │   │   │       ├── single_process_config.json
│       │   │   │       ├── test_ds_aio.py
│       │   │   │       ├── test_ds_aio_utils.py
│       │   │   │       └── validate_async_io.py
│       │   │   ├── common/
│       │   │   │   └── custom_cuda_kernel.cu
│       │   │   ├── includes/
│       │   │   │   ├── StopWatch.h
│       │   │   │   ├── Timer.h
│       │   │   │   ├── compat.h
│       │   │   │   ├── context.h
│       │   │   │   ├── conversion_utils.h
│       │   │   │   ├── cpu_adagrad.h
│       │   │   │   ├── cpu_adam.h
│       │   │   │   ├── cublas_wrappers.h
│       │   │   │   ├── custom_cuda_layers.h
│       │   │   │   ├── dequantization_utils.h
│       │   │   │   ├── dropout.h
│       │   │   │   ├── ds_kernel_utils.h
│       │   │   │   ├── ds_transformer_cuda.h
│       │   │   │   ├── feed_forward.h
│       │   │   │   ├── gelu.h
│       │   │   │   ├── gemm_test.h
│       │   │   │   ├── general_kernels.h
│       │   │   │   ├── memory_access_utils.h
│       │   │   │   ├── normalize_layer.h
│       │   │   │   ├── quantization.h
│       │   │   │   ├── quantization_utils.h
│       │   │   │   ├── quantizer.h
│       │   │   │   ├── reduction_utils.h
│       │   │   │   ├── simd.h
│       │   │   │   ├── softmax.h
│       │   │   │   ├── strided_batch_gemm.h
│       │   │   │   └── type_shim.h
│       │   │   ├── lamb/
│       │   │   │   ├── fused_lamb_cuda.cpp
│       │   │   │   └── fused_lamb_cuda_kernel.cu
│       │   │   ├── quantization/
│       │   │   │   ├── dequantize.cu
│       │   │   │   ├── fake_quantizer.cu
│       │   │   │   ├── pt_binding.cpp
│       │   │   │   └── quantize.cu
│       │   │   ├── sparse_attention/
│       │   │   │   └── utils.cpp
│       │   │   ├── spatial/
│       │   │   │   ├── csrc/
│       │   │   │   │   ├── opt_bias_add.cu
│       │   │   │   │   └── pt_binding.cpp
│       │   │   │   └── includes/
│       │   │   │       └── spatial_cuda_layers.h
│       │   │   ├── transformer/
│       │   │   │   ├── cublas_wrappers.cu
│       │   │   │   ├── dropout_kernels.cu
│       │   │   │   ├── ds_transformer_cuda.cpp
│       │   │   │   ├── gelu_kernels.cu
│       │   │   │   ├── general_kernels.cu
│       │   │   │   ├── inference/
│       │   │   │   │   ├── csrc/
│       │   │   │   │   │   ├── apply_rotary_pos_emb.cu
│       │   │   │   │   │   ├── dequantize.cu
│       │   │   │   │   │   ├── gelu.cu
│       │   │   │   │   │   ├── layer_norm.cu
│       │   │   │   │   │   ├── pt_binding.cpp
│       │   │   │   │   │   ├── relu.cu
│       │   │   │   │   │   ├── softmax.cu
│       │   │   │   │   │   └── transform.cu
│       │   │   │   │   └── includes/
│       │   │   │   │       ├── inference_context.h
│       │   │   │   │       ├── inference_cublas_wrappers.h
│       │   │   │   │       └── inference_cuda_layers.h
│       │   │   │   ├── normalize_kernels.cu
│       │   │   │   ├── softmax_kernels.cu
│       │   │   │   └── transform_kernels.cu
│       │   │   └── utils/
│       │   │       └── flatten_unflatten.cpp
│       │   ├── deepspeed/
│       │   │   ├── __init__.py
│       │   │   ├── accelerator/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── abstract_accelerator.py
│       │   │   │   ├── cuda_accelerator.py
│       │   │   │   └── real_accelerator.py
│       │   │   ├── autotuning/
│       │   │   │   ├── .gitignore
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── autotuner.py
│       │   │   │   ├── config.py
│       │   │   │   ├── config_templates/
│       │   │   │   │   ├── template_zero0.json
│       │   │   │   │   ├── template_zero1.json
│       │   │   │   │   ├── template_zero2.json
│       │   │   │   │   └── template_zero3.json
│       │   │   │   ├── constants.py
│       │   │   │   ├── scheduler.py
│       │   │   │   ├── tuner/
│       │   │   │   │   ├── README.md
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base_tuner.py
│       │   │   │   │   ├── cost_model.py
│       │   │   │   │   ├── index_based_tuner.py
│       │   │   │   │   ├── model_based_tuner.py
│       │   │   │   │   └── utils.py
│       │   │   │   └── utils.py
│       │   │   ├── checkpoint/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── deepspeed_checkpoint.py
│       │   │   │   ├── reshape_3d_utils.py
│       │   │   │   ├── reshape_meg_2d.py
│       │   │   │   ├── reshape_utils.py
│       │   │   │   ├── universal_checkpoint.py
│       │   │   │   ├── utils.py
│       │   │   │   └── zero_checkpoint.py
│       │   │   ├── comm/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── backend.py
│       │   │   │   ├── comm.py
│       │   │   │   ├── config.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── torch.py
│       │   │   │   └── utils.py
│       │   │   ├── compression/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── basic_layer.py
│       │   │   │   ├── compress.py
│       │   │   │   ├── config.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── helper.py
│       │   │   │   ├── scheduler.py
│       │   │   │   └── utils.py
│       │   │   ├── constants.py
│       │   │   ├── elasticity/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── config.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── elastic_agent.py
│       │   │   │   ├── elasticity.py
│       │   │   │   └── utils.py
│       │   │   ├── env_report.py
│       │   │   ├── git_version_info.py
│       │   │   ├── inference/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── config.py
│       │   │   │   └── engine.py
│       │   │   ├── launcher/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── launch.py
│       │   │   │   ├── multinode_runner.py
│       │   │   │   └── runner.py
│       │   │   ├── model_implementations/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── diffusers/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── unet.py
│       │   │   │   │   └── vae.py
│       │   │   │   └── transformers/
│       │   │   │       ├── __init__.py
│       │   │   │       ├── clip_encoder.py
│       │   │   │       └── ds_transformer.py
│       │   │   ├── module_inject/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── inject.py
│       │   │   │   ├── layers.py
│       │   │   │   ├── load_checkpoint.py
│       │   │   │   ├── module_quantize.py
│       │   │   │   ├── replace_module.py
│       │   │   │   └── replace_policy.py
│       │   │   ├── moe/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── experts.py
│       │   │   │   ├── layer.py
│       │   │   │   ├── mappings.py
│       │   │   │   ├── sharded_moe.py
│       │   │   │   └── utils.py
│       │   │   ├── monitor/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── config.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── csv_monitor.py
│       │   │   │   ├── monitor.py
│       │   │   │   ├── tensorboard.py
│       │   │   │   ├── utils.py
│       │   │   │   └── wandb.py
│       │   │   ├── nebula/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── config.py
│       │   │   │   └── constants.py
│       │   │   ├── ops/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── adagrad/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── cpu_adagrad.py
│       │   │   │   ├── adam/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── cpu_adam.py
│       │   │   │   │   ├── fused_adam.py
│       │   │   │   │   └── multi_tensor_apply.py
│       │   │   │   ├── aio/
│       │   │   │   │   └── __init__.py
│       │   │   │   ├── lamb/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── fused_lamb.py
│       │   │   │   ├── quantizer/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── quantizer.py
│       │   │   │   ├── sparse_attention/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── bert_sparse_self_attention.py
│       │   │   │   │   ├── matmul.py
│       │   │   │   │   ├── softmax.py
│       │   │   │   │   ├── sparse_attention_utils.py
│       │   │   │   │   ├── sparse_self_attention.py
│       │   │   │   │   ├── sparsity_config.py
│       │   │   │   │   └── trsrc/
│       │   │   │   │       ├── __init__.py
│       │   │   │   │       ├── matmul.tr
│       │   │   │   │       ├── softmax_bwd.tr
│       │   │   │   │       └── softmax_fwd.tr
│       │   │   │   └── transformer/
│       │   │   │       ├── __init__.py
│       │   │   │       ├── inference/
│       │   │   │       │   ├── __init__.py
│       │   │   │       │   ├── bias_add.py
│       │   │   │       │   ├── config.py
│       │   │   │       │   ├── diffusers_2d_transformer.py
│       │   │   │       │   ├── diffusers_attention.py
│       │   │   │       │   ├── diffusers_transformer_block.py
│       │   │   │       │   ├── ds_attention.py
│       │   │   │       │   ├── ds_mlp.py
│       │   │   │       │   ├── moe_inference.py
│       │   │   │       │   └── triton_ops.py
│       │   │   │       └── transformer.py
│       │   │   ├── pipe/
│       │   │   │   └── __init__.py
│       │   │   ├── profiling/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── config.py
│       │   │   │   ├── constants.py
│       │   │   │   └── flops_profiler/
│       │   │   │       ├── README.md
│       │   │   │       ├── __init__.py
│       │   │   │       └── profiler.py
│       │   │   ├── runtime/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── activation_checkpointing/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── checkpointing.py
│       │   │   │   │   └── config.py
│       │   │   │   ├── bf16_optimizer.py
│       │   │   │   ├── checkpoint_engine/
│       │   │   │   │   ├── README.md
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── checkpoint_engine.py
│       │   │   │   │   ├── nebula_checkpoint_engine.py
│       │   │   │   │   └── torch_checkpoint_engine.py
│       │   │   │   ├── comm/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── coalesced_collectives.py
│       │   │   │   │   ├── mpi.py
│       │   │   │   │   └── nccl.py
│       │   │   │   ├── compression/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── cupy.py
│       │   │   │   ├── config.py
│       │   │   │   ├── config_utils.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── data_pipeline/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── curriculum_scheduler.py
│       │   │   │   ├── dataloader.py
│       │   │   │   ├── eigenvalue.py
│       │   │   │   ├── engine.py
│       │   │   │   ├── fp16/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── fused_optimizer.py
│       │   │   │   │   ├── loss_scaler.py
│       │   │   │   │   ├── onebit/
│       │   │   │   │   │   ├── __init__.py
│       │   │   │   │   │   ├── adam.py
│       │   │   │   │   │   ├── lamb.py
│       │   │   │   │   │   └── zoadam.py
│       │   │   │   │   └── unfused_optimizer.py
│       │   │   │   ├── lr_schedules.py
│       │   │   │   ├── pipe/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── engine.py
│       │   │   │   │   ├── module.py
│       │   │   │   │   ├── p2p.py
│       │   │   │   │   ├── schedule.py
│       │   │   │   │   └── topology.py
│       │   │   │   ├── progressive_layer_drop.py
│       │   │   │   ├── quantize.py
│       │   │   │   ├── sparse_tensor.py
│       │   │   │   ├── state_dict_factory.py
│       │   │   │   ├── swap_tensor/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── aio_config.py
│       │   │   │   │   ├── async_swapper.py
│       │   │   │   │   ├── constants.py
│       │   │   │   │   ├── optimizer_utils.py
│       │   │   │   │   ├── partitioned_optimizer_swapper.py
│       │   │   │   │   ├── partitioned_param_swapper.py
│       │   │   │   │   ├── pipelined_optimizer_swapper.py
│       │   │   │   │   └── utils.py
│       │   │   │   ├── utils.py
│       │   │   │   ├── weight_quantizer.py
│       │   │   │   └── zero/
│       │   │   │       ├── __init__.py
│       │   │   │       ├── config.py
│       │   │   │       ├── contiguous_memory_allocator.py
│       │   │   │       ├── linear.py
│       │   │   │       ├── offload_config.py
│       │   │   │       ├── parameter_offload.py
│       │   │   │       ├── partition_parameters.py
│       │   │   │       ├── partitioned_param_coordinator.py
│       │   │   │       ├── stage3.py
│       │   │   │       ├── stage_1_and_2.py
│       │   │   │       ├── test.py
│       │   │   │       ├── tiling.py
│       │   │   │       └── utils.py
│       │   │   └── utils/
│       │   │       ├── __init__.py
│       │   │       ├── comms_logging.py
│       │   │       ├── debug.py
│       │   │       ├── exceptions.py
│       │   │       ├── groups.py
│       │   │       ├── init_on_device.py
│       │   │       ├── logging.py
│       │   │       ├── mixed_precision_linkage.py
│       │   │       ├── nvtx.py
│       │   │       ├── tensor_fragment.py
│       │   │       ├── timer.py
│       │   │       ├── types.py
│       │   │       └── zero_to_fp32.py
│       │   ├── docker/
│       │   │   ├── Dockerfile
│       │   │   └── Dockerfile.rocm
│       │   ├── docs/
│       │   │   ├── 404.html
│       │   │   ├── CNAME
│       │   │   ├── Gemfile
│       │   │   ├── README.md
│       │   │   ├── _config.yml
│       │   │   ├── _data/
│       │   │   │   └── navigation.yml
│       │   │   ├── _includes/
│       │   │   │   ├── analytics.html
│       │   │   │   ├── archive-single.html
│       │   │   │   ├── author-profile-custom-links.html
│       │   │   │   ├── author-profile.html
│       │   │   │   ├── breadcrumbs.html
│       │   │   │   ├── browser-upgrade.html
│       │   │   │   ├── category-list.html
│       │   │   │   ├── comment.html
│       │   │   │   ├── comments.html
│       │   │   │   ├── documents-collection.html
│       │   │   │   ├── feature_row
│       │   │   │   ├── figure
│       │   │   │   ├── footer.html
│       │   │   │   ├── gallery
│       │   │   │   ├── group-by-array
│       │   │   │   ├── head.html
│       │   │   │   ├── masthead.html
│       │   │   │   ├── nav_list
│       │   │   │   ├── page__date.html
│       │   │   │   ├── page__hero.html
│       │   │   │   ├── page__hero_video.html
│       │   │   │   ├── page__meta.html
│       │   │   │   ├── page__taxonomy.html
│       │   │   │   ├── paginator.html
│       │   │   │   ├── post_pagination.html
│       │   │   │   ├── posts-category.html
│       │   │   │   ├── posts-tag.html
│       │   │   │   ├── scripts.html
│       │   │   │   ├── seo.html
│       │   │   │   ├── sidebar.html
│       │   │   │   ├── skip-links.html
│       │   │   │   ├── social-share.html
│       │   │   │   ├── tag-list.html
│       │   │   │   ├── toc
│       │   │   │   ├── toc.html
│       │   │   │   └── video
│       │   │   ├── _layouts/
│       │   │   │   └── single-full.html
│       │   │   ├── _pages/
│       │   │   │   ├── compression.md
│       │   │   │   ├── config-json.md
│       │   │   │   ├── inference.md
│       │   │   │   ├── posts-landing.md
│       │   │   │   ├── posts_list_landing.md
│       │   │   │   ├── training.md
│       │   │   │   └── tutorials-landing.md
│       │   │   ├── _posts/
│       │   │   │   ├── 2020-02-13-release.md
│       │   │   │   ├── 2020-02-13-turing-nlg.md
│       │   │   │   ├── 2020-03-17-reduce-scatter.md
│       │   │   │   ├── 2020-05-19-bert-record.md
│       │   │   │   ├── 2020-05-19-press-release.md
│       │   │   │   ├── 2020-05-19-zero-stage2.md
│       │   │   │   ├── 2020-05-28-fastest-bert-training.md
│       │   │   │   ├── 2020-07-24-deepspeed-webinar.md
│       │   │   │   ├── 2020-08-07-webinar-on-demand.md
│       │   │   │   ├── 2020-09-08-sparse-attention-news.md
│       │   │   │   ├── 2020-09-09-ZeRO-Offload.md
│       │   │   │   ├── 2020-09-09-onebit-adam-blog-post.md
│       │   │   │   ├── 2020-09-09-onebit-adam-news.md
│       │   │   │   ├── 2020-09-09-pipeline-parallelism.md
│       │   │   │   ├── 2020-09-09-sparse-attention.md
│       │   │   │   ├── 2020-10-28-progressive-layer-dropping-news.md
│       │   │   │   ├── 2021-03-08-zero3-offload.md
│       │   │   │   ├── 2021-05-05-MoQ.md
│       │   │   │   ├── 2021-05-05-inference-kernel-optimization.md
│       │   │   │   ├── 2021-05-14-inference-release.md
│       │   │   │   ├── 2021-08-18-deepspeed-moe.md
│       │   │   │   ├── 2021-11-15-autotuning.md
│       │   │   │   ├── 2021-12-09-deepspeed-moe-nlg.md
│       │   │   │   ├── 2022-01-19-moe-inference.md
│       │   │   │   ├── 2022-03-21-amd-support.md
│       │   │   │   ├── 2022-07-26-deepspeed-azure.md
│       │   │   │   ├── 2022-09-10-zero-inference.md
│       │   │   │   └── 2022-10-11-mii.md
│       │   │   ├── _sass/
│       │   │   │   ├── button-group.scss
│       │   │   │   ├── minimal-mistakes/
│       │   │   │   │   ├── _archive.scss
│       │   │   │   │   ├── _navigation.scss
│       │   │   │   │   ├── _page.scss
│       │   │   │   │   ├── _sidebar.scss
│       │   │   │   │   ├── _variables.scss
│       │   │   │   │   └── skins/
│       │   │   │   │       └── _air.scss
│       │   │   │   └── minimal-mistakes.scss
│       │   │   ├── _tutorials/
│       │   │   │   ├── MoQ-tutorial.md
│       │   │   │   ├── advanced-install.md
│       │   │   │   ├── autotuning.md
│       │   │   │   ├── azure.md
│       │   │   │   ├── bert-finetuning.md
│       │   │   │   ├── bert-pretraining.md
│       │   │   │   ├── cifar-10.md
│       │   │   │   ├── comms-logging.md
│       │   │   │   ├── curriculum-learning.md
│       │   │   │   ├── flops-profiler.md
│       │   │   │   ├── gan.md
│       │   │   │   ├── getting-started.md
│       │   │   │   ├── inference-tutorial.md
│       │   │   │   ├── large-models-w-deepspeed.md
│       │   │   │   ├── lrrt.md
│       │   │   │   ├── megatron.md
│       │   │   │   ├── mixture-of-experts-inference.md
│       │   │   │   ├── mixture-of-experts-nlg.md
│       │   │   │   ├── mixture-of-experts.md
│       │   │   │   ├── model-compression.md
│       │   │   │   ├── monitor.md
│       │   │   │   ├── one-cycle.md
│       │   │   │   ├── onebit-adam.md
│       │   │   │   ├── onebit-lamb.md
│       │   │   │   ├── pipeline.md
│       │   │   │   ├── progressive_layer_dropping.md
│       │   │   │   ├── pytorch-profiler.md
│       │   │   │   ├── sparse-attention.md
│       │   │   │   ├── transformer_kernel.md
│       │   │   │   ├── zero-offload.md
│       │   │   │   ├── zero-one-adam.md
│       │   │   │   └── zero.md
│       │   │   ├── assets/
│       │   │   │   └── css/
│       │   │   │       └── main.scss
│       │   │   ├── code-docs/
│       │   │   │   ├── Makefile
│       │   │   │   ├── build-api-docs.sh
│       │   │   │   └── source/
│       │   │   │       ├── activation-checkpointing.rst
│       │   │   │       ├── autotuning.rst
│       │   │   │       ├── conf.py
│       │   │   │       ├── flops-profiler.rst
│       │   │   │       ├── index.rst
│       │   │   │       ├── inference-engine.rst
│       │   │   │       ├── inference-init.rst
│       │   │   │       ├── initialize.rst
│       │   │   │       ├── kernel.rst
│       │   │   │       ├── memory.rst
│       │   │   │       ├── model-checkpointing.rst
│       │   │   │       ├── moe.rst
│       │   │   │       ├── optimizers.rst
│       │   │   │       ├── pipeline.rst
│       │   │   │       ├── schedulers.rst
│       │   │   │       ├── training.rst
│       │   │   │       └── zero3.rst
│       │   │   ├── contributing.md
│       │   │   └── index.md
│       │   ├── examples/
│       │   │   └── README.md
│       │   ├── install.sh
│       │   ├── op_builder/
│       │   │   ├── __init__.py
│       │   │   ├── all_ops.py
│       │   │   ├── async_io.py
│       │   │   ├── builder.py
│       │   │   ├── builder_names.py
│       │   │   ├── cpu_adagrad.py
│       │   │   ├── cpu_adam.py
│       │   │   ├── fused_adam.py
│       │   │   ├── fused_lamb.py
│       │   │   ├── quantizer.py
│       │   │   ├── sparse_attn.py
│       │   │   ├── spatial_inference.py
│       │   │   ├── stochastic_transformer.py
│       │   │   ├── transformer.py
│       │   │   ├── transformer_inference.py
│       │   │   └── utils.py
│       │   ├── release/
│       │   │   ├── bump_patch_version.py
│       │   │   └── release.sh
│       │   ├── requirements/
│       │   │   ├── requirements-1bit-mpi.txt
│       │   │   ├── requirements-autotuning-ml.txt
│       │   │   ├── requirements-autotuning.txt
│       │   │   ├── requirements-dev.txt
│       │   │   ├── requirements-inf.txt
│       │   │   ├── requirements-readthedocs.txt
│       │   │   ├── requirements-sd.txt
│       │   │   ├── requirements-sparse_attn.txt
│       │   │   └── requirements.txt
│       │   ├── scripts/
│       │   │   └── check-torchdist.py
│       │   ├── setup.cfg
│       │   ├── setup.py
│       │   ├── tests/
│       │   │   ├── benchmarks/
│       │   │   │   ├── flatten_bench.py
│       │   │   │   └── unflatten_bench.py
│       │   │   ├── conftest.py
│       │   │   ├── lightning/
│       │   │   │   └── test_simple.py
│       │   │   ├── model/
│       │   │   │   ├── BingBertSquad/
│       │   │   │   │   ├── BingBertSquad_run_func_test.py
│       │   │   │   │   ├── BingBertSquad_test_common.py
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── deepspeed_bsz24_fp16_config.json
│       │   │   │   │   ├── deepspeed_bsz24_fp16_eigenvalue_quantize_config.json
│       │   │   │   │   ├── deepspeed_bsz24_fp16_zero2_config.json
│       │   │   │   │   ├── deepspeed_bsz24_fp32_config.json
│       │   │   │   │   ├── run_BingBertSquad.sh
│       │   │   │   │   ├── run_BingBertSquad_sanity.sh
│       │   │   │   │   ├── run_tests.sh
│       │   │   │   │   └── test_e2e_squad.py
│       │   │   │   ├── Megatron_GPT2/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── ds_config_func_bs4_zero1.json
│       │   │   │   │   ├── ds_config_func_bs4_zero2.json
│       │   │   │   │   ├── ds_config_func_bs4_zero2_offload.json
│       │   │   │   │   ├── ds_config_func_bs8_no_zero.json
│       │   │   │   │   ├── ds_config_func_bs8_zero0_gas3.json
│       │   │   │   │   ├── ds_config_func_bs8_zero1.json
│       │   │   │   │   ├── ds_config_func_bs8_zero2.json
│       │   │   │   │   ├── ds_config_func_bs8_zero2_gas3.json
│       │   │   │   │   ├── ds_config_func_bs8_zero2_offload.json
│       │   │   │   │   ├── ds_config_func_scheduler.json
│       │   │   │   │   ├── ds_config_perf_bs16.json
│       │   │   │   │   ├── ds_config_perf_bs32.json
│       │   │   │   │   ├── ds_config_perf_bs8.json
│       │   │   │   │   ├── ds_gpt2_test.sh
│       │   │   │   │   ├── run_checkpoint_test.py
│       │   │   │   │   ├── run_func_test.py
│       │   │   │   │   ├── run_perf_baseline.py
│       │   │   │   │   ├── run_perf_test.py
│       │   │   │   │   └── test_common.py
│       │   │   │   └── run_sanity_check.py
│       │   │   ├── onebit/
│       │   │   │   ├── test_mpi_backend.py
│       │   │   │   ├── test_mpi_perf.py
│       │   │   │   ├── test_nccl_backend.py
│       │   │   │   └── test_nccl_perf.py
│       │   │   ├── perf/
│       │   │   │   ├── adam_test.py
│       │   │   │   └── adam_test1.py
│       │   │   ├── pytest.ini
│       │   │   ├── small_model_debugging/
│       │   │   │   ├── stage3_test.py
│       │   │   │   ├── test.py
│       │   │   │   └── test_model.py
│       │   │   └── unit/
│       │   │       ├── __init__.py
│       │   │       ├── alexnet_model.py
│       │   │       ├── autotuning/
│       │   │       │   └── test_autotuning.py
│       │   │       ├── checkpoint/
│       │   │       │   ├── common.py
│       │   │       │   ├── test_latest_checkpoint.py
│       │   │       │   ├── test_lr_scheduler.py
│       │   │       │   ├── test_moe_checkpoint.py
│       │   │       │   ├── test_other_optimizer.py
│       │   │       │   ├── test_pipeline.py
│       │   │       │   ├── test_reshape_checkpoint.py
│       │   │       │   ├── test_sparse.py
│       │   │       │   ├── test_tag_validation.py
│       │   │       │   └── test_zero_optimizer.py
│       │   │       ├── comm/
│       │   │       │   └── test_dist.py
│       │   │       ├── common.py
│       │   │       ├── compression/
│       │   │       │   └── test_compression.py
│       │   │       ├── ds_batch_config.json
│       │   │       ├── elasticity/
│       │   │       │   └── test_elastic.py
│       │   │       ├── gpt2-merges.txt
│       │   │       ├── gpt2-vocab.json
│       │   │       ├── inference/
│       │   │       │   ├── test_checkpoint_sharding.py
│       │   │       │   ├── test_inference.py
│       │   │       │   ├── test_inference_config.py
│       │   │       │   └── test_model_profiling.py
│       │   │       ├── launcher/
│       │   │       │   ├── test_ds_arguments.py
│       │   │       │   ├── test_multinode_runner.py
│       │   │       │   └── test_run.py
│       │   │       ├── megatron_model.py
│       │   │       ├── model_parallelism/
│       │   │       │   ├── test_configurable_parallel_mp.py
│       │   │       │   └── test_configurable_parallel_pp.py
│       │   │       ├── modeling.py
│       │   │       ├── modelingpreln.py
│       │   │       ├── moe/
│       │   │       │   ├── test_moe.py
│       │   │       │   └── test_moe_tp.py
│       │   │       ├── monitor/
│       │   │       │   └── test_monitor.py
│       │   │       ├── multi_output_model.py
│       │   │       ├── ops/
│       │   │       │   ├── adagrad/
│       │   │       │   │   └── test_cpu_adagrad.py
│       │   │       │   ├── adam/
│       │   │       │   │   ├── test_adamw.py
│       │   │       │   │   └── test_cpu_adam.py
│       │   │       │   ├── aio/
│       │   │       │   │   └── test_aio.py
│       │   │       │   ├── cuda/
│       │   │       │   │   ├── test_cuda_backward.py
│       │   │       │   │   └── test_cuda_forward.py
│       │   │       │   ├── quantizer/
│       │   │       │   │   ├── test_dequantize.py
│       │   │       │   │   ├── test_fake_quantization.py
│       │   │       │   │   └── test_quantize.py
│       │   │       │   ├── sparse_attention/
│       │   │       │   │   └── test_sparse_attention.py
│       │   │       │   ├── spatial/
│       │   │       │   │   └── test_nhwc_bias_add.py
│       │   │       │   └── transformer/
│       │   │       │       └── inference/
│       │   │       │           ├── test_bias_add.py
│       │   │       │           ├── test_bias_geglu.py
│       │   │       │           ├── test_bias_gelu.py
│       │   │       │           ├── test_bias_relu.py
│       │   │       │           ├── test_layer_norm.py
│       │   │       │           ├── test_moe_res_matmult.py
│       │   │       │           └── test_residual_add.py
│       │   │       ├── pipe/
│       │   │       │   └── test_pipe_module.py
│       │   │       ├── profiling/
│       │   │       │   └── flops_profiler/
│       │   │       │       └── test_flops_profiler.py
│       │   │       ├── runtime/
│       │   │       │   ├── activation_checkpointing/
│       │   │       │   │   └── test_activation_checkpointing.py
│       │   │       │   ├── comm/
│       │   │       │   │   └── test_coalesced_collectives.py
│       │   │       │   ├── half_precision/
│       │   │       │   │   ├── onebit/
│       │   │       │   │   │   └── test_onebit.py
│       │   │       │   │   ├── test_bf16.py
│       │   │       │   │   ├── test_dynamic_loss_scale.py
│       │   │       │   │   └── test_fp16.py
│       │   │       │   ├── pipe/
│       │   │       │   │   ├── test_pipe.py
│       │   │       │   │   ├── test_pipe_schedule.py
│       │   │       │   │   └── test_topology.py
│       │   │       │   ├── sparse_tensor/
│       │   │       │   │   ├── test_averaging_sparse_gradients.py
│       │   │       │   │   ├── test_csr.py
│       │   │       │   │   └── test_sparse_grads.py
│       │   │       │   ├── test_autocast.py
│       │   │       │   ├── test_curriculum_learning.py
│       │   │       │   ├── test_data.py
│       │   │       │   ├── test_ds_config_dict.py
│       │   │       │   ├── test_ds_config_model.py
│       │   │       │   ├── test_ds_initialize.py
│       │   │       │   ├── test_lr_schedulers.py
│       │   │       │   ├── test_multi_output_model.py
│       │   │       │   ├── test_pld.py
│       │   │       │   ├── test_runtime_utils.py
│       │   │       │   ├── utils/
│       │   │       │   │   └── test_partition.py
│       │   │       │   └── zero/
│       │   │       │       ├── test_ignore_unused_parameters.py
│       │   │       │       ├── test_zero.py
│       │   │       │       ├── test_zero_config.py
│       │   │       │       ├── test_zero_context.py
│       │   │       │       └── test_zero_tiled.py
│       │   │       ├── simple_model.py
│       │   │       ├── util.py
│       │   │       └── utils/
│       │   │           ├── test_get_optim_files.py
│       │   │           ├── test_groups.py
│       │   │           └── test_init_on_device.py
│       │   └── version.txt
│       ├── README.md
│       ├── pagecache-mangagement/
│       │   ├── .svn/
│       │   │   ├── all-wcprops
│       │   │   └── entries
│       │   ├── README.md
│       │   ├── branches/
│       │   │   └── .svn/
│       │   │       ├── all-wcprops
│       │   │       └── entries
│       │   ├── tags/
│       │   │   └── .svn/
│       │   │       ├── all-wcprops
│       │   │       └── entries
│       │   └── trunk/
│       │       ├── .svn/
│       │       │   ├── all-wcprops
│       │       │   ├── entries
│       │       │   ├── prop-base/
│       │       │   │   ├── benchmar_plain.svn-base
│       │       │   │   ├── benchmar_prepare.svn-base
│       │       │   │   ├── benchmar_qemu.svn-base
│       │       │   │   ├── benchmar_qemu2.svn-base
│       │       │   │   ├── benchmar_squashfs.svn-base
│       │       │   │   ├── benchmar_test.sh.svn-base
│       │       │   │   ├── fadv.sh.svn-base
│       │       │   │   ├── fadv_command.sh.svn-base
│       │       │   │   ├── pagecache-management-fadv.sh.svn-base
│       │       │   │   ├── pagecache-management-ignore-reads.sh.svn-base
│       │       │   │   ├── pagecache-management-lazy200.sh.svn-base
│       │       │   │   ├── pagecache-management-lazy200ir.sh.svn-base
│       │       │   │   ├── pagecache-management-null.sh.svn-base
│       │       │   │   └── pagecache-management.sh.svn-base
│       │       │   └── text-base/
│       │       │       ├── Makefile.svn-base
│       │       │       ├── benchmar_plain.svn-base
│       │       │       ├── benchmar_prepare.svn-base
│       │       │       ├── benchmar_qemu.svn-base
│       │       │       ├── benchmar_qemu2.svn-base
│       │       │       ├── benchmar_squashfs.svn-base
│       │       │       ├── benchmar_test.sh.svn-base
│       │       │       ├── fadv.c.svn-base
│       │       │       ├── fadv.sh.svn-base
│       │       │       ├── fadv_command.sh.svn-base
│       │       │       ├── pagecache-management-fadv.sh.svn-base
│       │       │       ├── pagecache-management-ignore-reads.sh.svn-base
│       │       │       ├── pagecache-management-lazy200.sh.svn-base
│       │       │       ├── pagecache-management-lazy200ir.sh.svn-base
│       │       │       ├── pagecache-management-null.sh.svn-base
│       │       │       ├── pagecache-management.c.svn-base
│       │       │       ├── pagecache-management.sh.svn-base
│       │       │       ├── pagecache-management.txt.svn-base
│       │       │       ├── sfr.c.svn-base
│       │       │       ├── sync_file_range.h.svn-base
│       │       │       └── test.c.svn-base
│       │       ├── benchmar_plain
│       │       ├── benchmar_prepare
│       │       ├── benchmar_qemu
│       │       ├── benchmar_qemu2
│       │       ├── benchmar_squashfs
│       │       ├── benchmar_test.sh
│       │       ├── fadv.c
│       │       ├── fadv.sh
│       │       ├── fadv_command.sh
│       │       ├── pagecache-management-fadv.sh
│       │       ├── pagecache-management-ignore-reads.sh
│       │       ├── pagecache-management-lazy200.sh
│       │       ├── pagecache-management-lazy200ir.sh
│       │       ├── pagecache-management-null.sh
│       │       ├── pagecache-management.c
│       │       ├── pagecache-management.sh
│       │       ├── pagecache-management.txt
│       │       ├── sfr.c
│       │       ├── sync_file_range.h
│       │       └── test.c
│       └── transformers/
│           ├── .circleci/
│           │   ├── TROUBLESHOOT.md
│           │   ├── config.yml
│           │   └── create_circleci_config.py
│           ├── .coveragerc
│           ├── .gitattributes
│           ├── .github/
│           │   ├── ISSUE_TEMPLATE/
│           │   │   ├── bug-report.yml
│           │   │   ├── config.yml
│           │   │   ├── feature-request.yml
│           │   │   ├── migration.yml
│           │   │   └── new-model-addition.yml
│           │   ├── PULL_REQUEST_TEMPLATE.md
│           │   ├── conda/
│           │   │   ├── build.sh
│           │   │   └── meta.yaml
│           │   └── workflows/
│           │       ├── TROUBLESHOOT.md
│           │       ├── add-model-like.yml
│           │       ├── build-docker-images.yml
│           │       ├── build-past-ci-docker-images.yml
│           │       ├── build_documentation.yml
│           │       ├── build_pr_documentation.yml
│           │       ├── check_runner_status.yml
│           │       ├── delete_doc_comment.yml
│           │       ├── doctests.yml
│           │       ├── model-templates.yml
│           │       ├── release-conda.yml
│           │       ├── self-nightly-scheduled.yml
│           │       ├── self-past-caller.yml
│           │       ├── self-past.yml
│           │       ├── self-push-caller.yml
│           │       ├── self-push.yml
│           │       ├── self-scheduled.yml
│           │       ├── stale.yml
│           │       └── update_metdata.yml
│           ├── .gitignore
│           ├── CITATION.cff
│           ├── CODE_OF_CONDUCT.md
│           ├── CONTRIBUTING.md
│           ├── ISSUES.md
│           ├── LICENSE
│           ├── MANIFEST.in
│           ├── Makefile
│           ├── README.md
│           ├── README_es.md
│           ├── README_ko.md
│           ├── README_zh-hans.md
│           ├── README_zh-hant.md
│           ├── conftest.py
│           ├── docker/
│           │   ├── transformers-all-latest-gpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-cpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-doc-builder/
│           │   │   └── Dockerfile
│           │   ├── transformers-gpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-past-gpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-pytorch-cpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-pytorch-deepspeed-latest-gpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-pytorch-deepspeed-nightly-gpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-pytorch-gpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-pytorch-tpu/
│           │   │   ├── Dockerfile
│           │   │   ├── bert-base-cased.jsonnet
│           │   │   ├── dataset.yaml
│           │   │   └── docker-entrypoint.sh
│           │   ├── transformers-tensorflow-cpu/
│           │   │   └── Dockerfile
│           │   └── transformers-tensorflow-gpu/
│           │       └── Dockerfile
│           ├── docs/
│           │   ├── README.md
│           │   ├── TRANSLATING.md
│           │   └── source/
│           │       ├── _config.py
│           │       ├── de/
│           │       │   ├── _config.py
│           │       │   ├── _toctree.yml
│           │       │   ├── accelerate.mdx
│           │       │   ├── autoclass_tutorial.mdx
│           │       │   ├── index.mdx
│           │       │   ├── installation.mdx
│           │       │   ├── model_sharing.mdx
│           │       │   ├── pipeline_tutorial.mdx
│           │       │   ├── preprocessing.mdx
│           │       │   ├── quicktour.mdx
│           │       │   └── training.mdx
│           │       ├── en/
│           │       │   ├── _config.py
│           │       │   ├── _toctree.yml
│           │       │   ├── accelerate.mdx
│           │       │   ├── add_new_model.mdx
│           │       │   ├── add_new_pipeline.mdx
│           │       │   ├── add_tensorflow_model.mdx
│           │       │   ├── autoclass_tutorial.mdx
│           │       │   ├── benchmarks.mdx
│           │       │   ├── bertology.mdx
│           │       │   ├── big_models.mdx
│           │       │   ├── community.mdx
│           │       │   ├── converting_tensorflow_models.mdx
│           │       │   ├── create_a_model.mdx
│           │       │   ├── custom_models.mdx
│           │       │   ├── debugging.mdx
│           │       │   ├── fast_tokenizers.mdx
│           │       │   ├── glossary.mdx
│           │       │   ├── hpo_train.mdx
│           │       │   ├── index.mdx
│           │       │   ├── installation.mdx
│           │       │   ├── internal/
│           │       │   │   ├── file_utils.mdx
│           │       │   │   ├── generation_utils.mdx
│           │       │   │   ├── image_processing_utils.mdx
│           │       │   │   ├── modeling_utils.mdx
│           │       │   │   ├── pipelines_utils.mdx
│           │       │   │   ├── tokenization_utils.mdx
│           │       │   │   └── trainer_utils.mdx
│           │       │   ├── main_classes/
│           │       │   │   ├── callback.mdx
│           │       │   │   ├── configuration.mdx
│           │       │   │   ├── data_collator.mdx
│           │       │   │   ├── deepspeed.mdx
│           │       │   │   ├── feature_extractor.mdx
│           │       │   │   ├── keras_callbacks.mdx
│           │       │   │   ├── logging.mdx
│           │       │   │   ├── model.mdx
│           │       │   │   ├── onnx.mdx
│           │       │   │   ├── optimizer_schedules.mdx
│           │       │   │   ├── output.mdx
│           │       │   │   ├── pipelines.mdx
│           │       │   │   ├── processors.mdx
│           │       │   │   ├── text_generation.mdx
│           │       │   │   ├── tokenizer.mdx
│           │       │   │   └── trainer.mdx
│           │       │   ├── migration.mdx
│           │       │   ├── model_doc/
│           │       │   │   ├── albert.mdx
│           │       │   │   ├── auto.mdx
│           │       │   │   ├── bart.mdx
│           │       │   │   ├── barthez.mdx
│           │       │   │   ├── bartpho.mdx
│           │       │   │   ├── beit.mdx
│           │       │   │   ├── bert-generation.mdx
│           │       │   │   ├── bert-japanese.mdx
│           │       │   │   ├── bert.mdx
│           │       │   │   ├── bertweet.mdx
│           │       │   │   ├── big_bird.mdx
│           │       │   │   ├── bigbird_pegasus.mdx
│           │       │   │   ├── blenderbot-small.mdx
│           │       │   │   ├── blenderbot.mdx
│           │       │   │   ├── bloom.mdx
│           │       │   │   ├── bort.mdx
│           │       │   │   ├── byt5.mdx
│           │       │   │   ├── camembert.mdx
│           │       │   │   ├── canine.mdx
│           │       │   │   ├── clip.mdx
│           │       │   │   ├── codegen.mdx
│           │       │   │   ├── conditional_detr.mdx
│           │       │   │   ├── convbert.mdx
│           │       │   │   ├── convnext.mdx
│           │       │   │   ├── cpm.mdx
│           │       │   │   ├── ctrl.mdx
│           │       │   │   ├── cvt.mdx
│           │       │   │   ├── data2vec.mdx
│           │       │   │   ├── deberta-v2.mdx
│           │       │   │   ├── deberta.mdx
│           │       │   │   ├── decision_transformer.mdx
│           │       │   │   ├── deformable_detr.mdx
│           │       │   │   ├── deit.mdx
│           │       │   │   ├── detr.mdx
│           │       │   │   ├── dialogpt.mdx
│           │       │   │   ├── distilbert.mdx
│           │       │   │   ├── dit.mdx
│           │       │   │   ├── donut.mdx
│           │       │   │   ├── dpr.mdx
│           │       │   │   ├── dpt.mdx
│           │       │   │   ├── electra.mdx
│           │       │   │   ├── encoder-decoder.mdx
│           │       │   │   ├── ernie.mdx
│           │       │   │   ├── esm.mdx
│           │       │   │   ├── flan-t5.mdx
│           │       │   │   ├── flaubert.mdx
│           │       │   │   ├── flava.mdx
│           │       │   │   ├── fnet.mdx
│           │       │   │   ├── fsmt.mdx
│           │       │   │   ├── funnel.mdx
│           │       │   │   ├── glpn.mdx
│           │       │   │   ├── gpt2.mdx
│           │       │   │   ├── gpt_neo.mdx
│           │       │   │   ├── gpt_neox.mdx
│           │       │   │   ├── gpt_neox_japanese.mdx
│           │       │   │   ├── gptj.mdx
│           │       │   │   ├── groupvit.mdx
│           │       │   │   ├── herbert.mdx
│           │       │   │   ├── hubert.mdx
│           │       │   │   ├── ibert.mdx
│           │       │   │   ├── imagegpt.mdx
│           │       │   │   ├── layoutlm.mdx
│           │       │   │   ├── layoutlmv2.mdx
│           │       │   │   ├── layoutlmv3.mdx
│           │       │   │   ├── layoutxlm.mdx
│           │       │   │   ├── led.mdx
│           │       │   │   ├── levit.mdx
│           │       │   │   ├── lilt.mdx
│           │       │   │   ├── longformer.mdx
│           │       │   │   ├── longt5.mdx
│           │       │   │   ├── luke.mdx
│           │       │   │   ├── lxmert.mdx
│           │       │   │   ├── m2m_100.mdx
│           │       │   │   ├── marian.mdx
│           │       │   │   ├── markuplm.mdx
│           │       │   │   ├── maskformer.mdx
│           │       │   │   ├── mbart.mdx
│           │       │   │   ├── mctct.mdx
│           │       │   │   ├── megatron-bert.mdx
│           │       │   │   ├── megatron_gpt2.mdx
│           │       │   │   ├── mluke.mdx
│           │       │   │   ├── mobilebert.mdx
│           │       │   │   ├── mobilevit.mdx
│           │       │   │   ├── mpnet.mdx
│           │       │   │   ├── mt5.mdx
│           │       │   │   ├── mvp.mdx
│           │       │   │   ├── nezha.mdx
│           │       │   │   ├── nllb.mdx
│           │       │   │   ├── nystromformer.mdx
│           │       │   │   ├── openai-gpt.mdx
│           │       │   │   ├── opt.mdx
│           │       │   │   ├── owlvit.mdx
│           │       │   │   ├── pegasus.mdx
│           │       │   │   ├── pegasus_x.mdx
│           │       │   │   ├── perceiver.mdx
│           │       │   │   ├── phobert.mdx
│           │       │   │   ├── plbart.mdx
│           │       │   │   ├── poolformer.mdx
│           │       │   │   ├── prophetnet.mdx
│           │       │   │   ├── qdqbert.mdx
│           │       │   │   ├── rag.mdx
│           │       │   │   ├── realm.mdx
│           │       │   │   ├── reformer.mdx
│           │       │   │   ├── regnet.mdx
│           │       │   │   ├── rembert.mdx
│           │       │   │   ├── resnet.mdx
│           │       │   │   ├── retribert.mdx
│           │       │   │   ├── roberta.mdx
│           │       │   │   ├── roformer.mdx
│           │       │   │   ├── segformer.mdx
│           │       │   │   ├── sew-d.mdx
│           │       │   │   ├── sew.mdx
│           │       │   │   ├── speech-encoder-decoder.mdx
│           │       │   │   ├── speech_to_text.mdx
│           │       │   │   ├── speech_to_text_2.mdx
│           │       │   │   ├── splinter.mdx
│           │       │   │   ├── squeezebert.mdx
│           │       │   │   ├── swin.mdx
│           │       │   │   ├── swinv2.mdx
│           │       │   │   ├── t5.mdx
│           │       │   │   ├── t5v1.1.mdx
│           │       │   │   ├── table-transformer.mdx
│           │       │   │   ├── tapas.mdx
│           │       │   │   ├── tapex.mdx
│           │       │   │   ├── time_series_transformer.mdx
│           │       │   │   ├── trajectory_transformer.mdx
│           │       │   │   ├── transfo-xl.mdx
│           │       │   │   ├── trocr.mdx
│           │       │   │   ├── ul2.mdx
│           │       │   │   ├── unispeech-sat.mdx
│           │       │   │   ├── unispeech.mdx
│           │       │   │   ├── van.mdx
│           │       │   │   ├── videomae.mdx
│           │       │   │   ├── vilt.mdx
│           │       │   │   ├── vision-encoder-decoder.mdx
│           │       │   │   ├── vision-text-dual-encoder.mdx
│           │       │   │   ├── visual_bert.mdx
│           │       │   │   ├── vit.mdx
│           │       │   │   ├── vit_mae.mdx
│           │       │   │   ├── vit_msn.mdx
│           │       │   │   ├── wav2vec2-conformer.mdx
│           │       │   │   ├── wav2vec2.mdx
│           │       │   │   ├── wav2vec2_phoneme.mdx
│           │       │   │   ├── wavlm.mdx
│           │       │   │   ├── whisper.mdx
│           │       │   │   ├── xclip.mdx
│           │       │   │   ├── xglm.mdx
│           │       │   │   ├── xlm-prophetnet.mdx
│           │       │   │   ├── xlm-roberta-xl.mdx
│           │       │   │   ├── xlm-roberta.mdx
│           │       │   │   ├── xlm.mdx
│           │       │   │   ├── xlnet.mdx
│           │       │   │   ├── xls_r.mdx
│           │       │   │   ├── xlsr_wav2vec2.mdx
│           │       │   │   ├── yolos.mdx
│           │       │   │   └── yoso.mdx
│           │       │   ├── model_sharing.mdx
│           │       │   ├── model_summary.mdx
│           │       │   ├── multilingual.mdx
│           │       │   ├── pad_truncation.mdx
│           │       │   ├── perf_hardware.mdx
│           │       │   ├── perf_infer_cpu.mdx
│           │       │   ├── perf_infer_gpu_many.mdx
│           │       │   ├── perf_infer_gpu_one.mdx
│           │       │   ├── perf_infer_special.mdx
│           │       │   ├── perf_train_cpu.mdx
│           │       │   ├── perf_train_cpu_many.mdx
│           │       │   ├── perf_train_gpu_many.mdx
│           │       │   ├── perf_train_gpu_one.mdx
│           │       │   ├── perf_train_special.mdx
│           │       │   ├── perf_train_tpu.mdx
│           │       │   ├── performance.mdx
│           │       │   ├── perplexity.mdx
│           │       │   ├── philosophy.mdx
│           │       │   ├── pipeline_tutorial.mdx
│           │       │   ├── pr_checks.mdx
│           │       │   ├── preprocessing.mdx
│           │       │   ├── quicktour.mdx
│           │       │   ├── run_scripts.mdx
│           │       │   ├── sagemaker.mdx
│           │       │   ├── serialization.mdx
│           │       │   ├── task_summary.mdx
│           │       │   ├── tasks/
│           │       │   │   ├── asr.mdx
│           │       │   │   ├── audio_classification.mdx
│           │       │   │   ├── image_classification.mdx
│           │       │   │   ├── language_modeling.mdx
│           │       │   │   ├── multiple_choice.mdx
│           │       │   │   ├── question_answering.mdx
│           │       │   │   ├── semantic_segmentation.mdx
│           │       │   │   ├── sequence_classification.mdx
│           │       │   │   ├── summarization.mdx
│           │       │   │   ├── token_classification.mdx
│           │       │   │   └── translation.mdx
│           │       │   ├── testing.mdx
│           │       │   ├── tokenizer_summary.mdx
│           │       │   ├── torchscript.mdx
│           │       │   ├── training.mdx
│           │       │   └── troubleshooting.mdx
│           │       ├── es/
│           │       │   ├── _config.py
│           │       │   ├── _toctree.yml
│           │       │   ├── accelerate.mdx
│           │       │   ├── autoclass_tutorial.mdx
│           │       │   ├── bertology.mdx
│           │       │   ├── converting_tensorflow_models.mdx
│           │       │   ├── create_a_model.mdx
│           │       │   ├── custom_models.mdx
│           │       │   ├── fast_tokenizers.mdx
│           │       │   ├── index.mdx
│           │       │   ├── installation.mdx
│           │       │   ├── model_sharing.mdx
│           │       │   ├── multilingual.mdx
│           │       │   ├── philosophy.mdx
│           │       │   ├── pipeline_tutorial.mdx
│           │       │   ├── preprocessing.mdx
│           │       │   ├── quicktour.mdx
│           │       │   ├── run_scripts.mdx
│           │       │   ├── sagemaker.mdx
│           │       │   ├── tasks/
│           │       │   │   ├── image_classification.mdx
│           │       │   │   ├── language_modeling.mdx
│           │       │   │   ├── multiple_choice.mdx
│           │       │   │   ├── question_answering.mdx
│           │       │   │   └── summarization.mdx
│           │       │   └── training.mdx
│           │       ├── it/
│           │       │   ├── _config.py
│           │       │   ├── _toctree.yml
│           │       │   ├── accelerate.mdx
│           │       │   ├── add_new_model.mdx
│           │       │   ├── add_new_pipeline.mdx
│           │       │   ├── autoclass_tutorial.mdx
│           │       │   ├── converting_tensorflow_models.mdx
│           │       │   ├── create_a_model.mdx
│           │       │   ├── custom_models.mdx
│           │       │   ├── debugging.mdx
│           │       │   ├── index.mdx
│           │       │   ├── installation.mdx
│           │       │   ├── model_sharing.mdx
│           │       │   ├── multilingual.mdx
│           │       │   ├── perf_hardware.mdx
│           │       │   ├── pipeline_tutorial.mdx
│           │       │   ├── preprocessing.mdx
│           │       │   ├── quicktour.mdx
│           │       │   ├── run_scripts.mdx
│           │       │   ├── serialization.mdx
│           │       │   └── training.mdx
│           │       └── pt/
│           │           ├── _config.py
│           │           ├── _toctree.yml
│           │           ├── accelerate.mdx
│           │           ├── converting_tensorflow_models.mdx
│           │           ├── create_a_model.mdx
│           │           ├── custom_models.mdx
│           │           ├── fast_tokenizers.mdx
│           │           ├── index.mdx
│           │           ├── installation.mdx
│           │           ├── multilingual.mdx
│           │           ├── pipeline_tutorial.mdx
│           │           ├── quicktour.mdx
│           │           ├── run_scripts.mdx
│           │           ├── serialization.mdx
│           │           ├── tasks/
│           │           │   ├── sequence_classification.mdx
│           │           │   └── token_classification.mdx
│           │           └── training.mdx
│           ├── examples/
│           │   ├── README.md
│           │   ├── flax/
│           │   │   ├── README.md
│           │   │   ├── _tests_requirements.txt
│           │   │   ├── conftest.py
│           │   │   ├── image-captioning/
│           │   │   │   ├── README.md
│           │   │   │   ├── create_model_from_encoder_decoder_models.py
│           │   │   │   └── run_image_captioning_flax.py
│           │   │   ├── language-modeling/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_bart_dlm_flax.py
│           │   │   │   ├── run_clm_flax.py
│           │   │   │   ├── run_mlm_flax.py
│           │   │   │   ├── run_t5_mlm_flax.py
│           │   │   │   └── t5_tokenizer_model.py
│           │   │   ├── question-answering/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_qa.py
│           │   │   │   └── utils_qa.py
│           │   │   ├── summarization/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_summarization_flax.py
│           │   │   ├── test_flax_examples.py
│           │   │   ├── text-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_flax_glue.py
│           │   │   ├── token-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_flax_ner.py
│           │   │   └── vision/
│           │   │       ├── README.md
│           │   │       ├── requirements.txt
│           │   │       └── run_image_classification.py
│           │   ├── legacy/
│           │   │   ├── README.md
│           │   │   ├── multiple_choice/
│           │   │   │   ├── run_multiple_choice.py
│           │   │   │   └── utils_multiple_choice.py
│           │   │   ├── pytorch-lightning/
│           │   │   │   ├── lightning_base.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_glue.py
│           │   │   │   ├── run_glue.sh
│           │   │   │   ├── run_ner.py
│           │   │   │   ├── run_ner.sh
│           │   │   │   └── run_pos.sh
│           │   │   ├── question-answering/
│           │   │   │   ├── README.md
│           │   │   │   ├── run_squad.py
│           │   │   │   └── run_squad_trainer.py
│           │   │   ├── run_camembert.py
│           │   │   ├── run_chinese_ref.py
│           │   │   ├── run_language_modeling.py
│           │   │   ├── run_openai_gpt.py
│           │   │   ├── run_swag.py
│           │   │   ├── run_transfo_xl.py
│           │   │   ├── seq2seq/
│           │   │   │   ├── README.md
│           │   │   │   ├── __init__.py
│           │   │   │   ├── convert_model_to_fp16.py
│           │   │   │   ├── download_wmt.py
│           │   │   │   ├── finetune.sh
│           │   │   │   ├── finetune_tpu.sh
│           │   │   │   ├── finetune_trainer.py
│           │   │   │   ├── minify_dataset.py
│           │   │   │   ├── old_test_calculate_rouge.py
│           │   │   │   ├── old_test_datasets.py
│           │   │   │   ├── old_test_fsmt_bleu_score.py
│           │   │   │   ├── old_test_seq2seq_examples.py
│           │   │   │   ├── old_test_seq2seq_examples_multi_gpu.py
│           │   │   │   ├── old_test_tatoeba_conversion.py
│           │   │   │   ├── pack_dataset.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── romanian_postprocessing.md
│           │   │   │   ├── rouge_cli.py
│           │   │   │   ├── run_distributed_eval.py
│           │   │   │   ├── run_eval.py
│           │   │   │   ├── run_eval_search.py
│           │   │   │   ├── save_len_file.py
│           │   │   │   ├── save_randomly_initialized_model.py
│           │   │   │   ├── sentence_splitter.py
│           │   │   │   ├── seq2seq_trainer.py
│           │   │   │   ├── seq2seq_training_args.py
│           │   │   │   ├── test_data/
│           │   │   │   │   ├── fsmt/
│           │   │   │   │   │   ├── build-eval-data.py
│           │   │   │   │   │   └── fsmt_val_data.json
│           │   │   │   │   └── wmt_en_ro/
│           │   │   │   │       ├── test.source
│           │   │   │   │       ├── test.target
│           │   │   │   │       ├── train.len
│           │   │   │   │       ├── train.source
│           │   │   │   │       ├── train.target
│           │   │   │   │       ├── val.len
│           │   │   │   │       ├── val.source
│           │   │   │   │       └── val.target
│           │   │   │   ├── train_distil_marian_enro.sh
│           │   │   │   ├── train_distil_marian_enro_tpu.sh
│           │   │   │   ├── train_distilbart_cnn.sh
│           │   │   │   ├── train_mbart_cc25_enro.sh
│           │   │   │   ├── utils.py
│           │   │   │   └── xla_spawn.py
│           │   │   ├── text-classification/
│           │   │   │   └── run_tf_text_classification.py
│           │   │   └── token-classification/
│           │   │       ├── README.md
│           │   │       ├── run.sh
│           │   │       ├── run_chunk.sh
│           │   │       ├── run_ner.py
│           │   │       ├── run_pos.sh
│           │   │       ├── run_tf_ner.py
│           │   │       ├── scripts/
│           │   │       │   └── preprocess.py
│           │   │       ├── tasks.py
│           │   │       └── utils_ner.py
│           │   ├── pytorch/
│           │   │   ├── README.md
│           │   │   ├── _tests_requirements.txt
│           │   │   ├── audio-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_audio_classification.py
│           │   │   ├── benchmarking/
│           │   │   │   ├── README.md
│           │   │   │   ├── plot_csv_file.py
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_benchmark.py
│           │   │   ├── conftest.py
│           │   │   ├── contrastive-image-text/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_clip.py
│           │   │   ├── image-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_image_classification.py
│           │   │   │   └── run_image_classification_no_trainer.py
│           │   │   ├── image-pretraining/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_mae.py
│           │   │   │   └── run_mim.py
│           │   │   ├── language-modeling/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_clm.py
│           │   │   │   ├── run_clm_no_trainer.py
│           │   │   │   ├── run_mlm.py
│           │   │   │   ├── run_mlm_no_trainer.py
│           │   │   │   └── run_plm.py
│           │   │   ├── multiple-choice/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_no_trainer.sh
│           │   │   │   ├── run_swag.py
│           │   │   │   └── run_swag_no_trainer.py
│           │   │   ├── question-answering/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_qa.py
│           │   │   │   ├── run_qa_beam_search.py
│           │   │   │   ├── run_qa_beam_search_no_trainer.py
│           │   │   │   ├── run_qa_no_trainer.py
│           │   │   │   ├── run_seq2seq_qa.py
│           │   │   │   ├── trainer_qa.py
│           │   │   │   ├── trainer_seq2seq_qa.py
│           │   │   │   └── utils_qa.py
│           │   │   ├── semantic-segmentation/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_semantic_segmentation.py
│           │   │   │   └── run_semantic_segmentation_no_trainer.py
│           │   │   ├── speech-pretraining/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_wav2vec2_pretraining_no_trainer.py
│           │   │   ├── speech-recognition/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_speech_recognition_ctc.py
│           │   │   │   └── run_speech_recognition_seq2seq.py
│           │   │   ├── summarization/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_summarization.py
│           │   │   │   └── run_summarization_no_trainer.py
│           │   │   ├── test_accelerate_examples.py
│           │   │   ├── test_pytorch_examples.py
│           │   │   ├── test_xla_examples.py
│           │   │   ├── text-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_glue.py
│           │   │   │   ├── run_glue_no_trainer.py
│           │   │   │   └── run_xnli.py
│           │   │   ├── text-generation/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_generation.py
│           │   │   │   └── run_generation_contrastive_search.py
│           │   │   ├── token-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run.sh
│           │   │   │   ├── run_ner.py
│           │   │   │   ├── run_ner_no_trainer.py
│           │   │   │   └── run_no_trainer.sh
│           │   │   ├── translation/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_translation.py
│           │   │   │   └── run_translation_no_trainer.py
│           │   │   └── xla_spawn.py
│           │   ├── research_projects/
│           │   │   ├── README.md
│           │   │   ├── adversarial/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_hans.py
│           │   │   │   └── utils_hans.py
│           │   │   ├── bert-loses-patience/
│           │   │   │   ├── README.md
│           │   │   │   ├── pabee/
│           │   │   │   │   ├── __init__.py
│           │   │   │   │   ├── modeling_pabee_albert.py
│           │   │   │   │   └── modeling_pabee_bert.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_glue_with_pabee.py
│           │   │   │   └── test_run_glue_with_pabee.py
│           │   │   ├── bertabs/
│           │   │   │   ├── README.md
│           │   │   │   ├── __init__.py
│           │   │   │   ├── configuration_bertabs.py
│           │   │   │   ├── convert_bertabs_original_pytorch_checkpoint.py
│           │   │   │   ├── modeling_bertabs.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_summarization.py
│           │   │   │   ├── test_utils_summarization.py
│           │   │   │   └── utils_summarization.py
│           │   │   ├── bertology/
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_bertology.py
│           │   │   │   └── run_prune_gpt.py
│           │   │   ├── codeparrot/
│           │   │   │   ├── README.md
│           │   │   │   ├── examples/
│           │   │   │   │   ├── README.md
│           │   │   │   │   ├── requirements.txt
│           │   │   │   │   └── train_complexity_predictor.py
│           │   │   │   ├── requirements.txt
│           │   │   │   └── scripts/
│           │   │   │       ├── arguments.py
│           │   │   │       ├── bpe_training.py
│           │   │   │       ├── codeparrot_training.py
│           │   │   │       ├── human_eval.py
│           │   │   │       ├── initialize_model.py
│           │   │   │       ├── minhash_deduplication.py
│           │   │   │       ├── preprocessing.py
│           │   │   │       ├── pretokenizing.py
│           │   │   │       ├── tests/
│           │   │   │       │   ├── __init__.py
│           │   │   │       │   └── test_deduplicate.py
│           │   │   │       └── validation_loss.py
│           │   │   ├── decision_transformer/
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_decision_transformer.py
│           │   │   ├── deebert/
│           │   │   │   ├── README.md
│           │   │   │   ├── entropy_eval.sh
│           │   │   │   ├── eval_deebert.sh
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_glue_deebert.py
│           │   │   │   ├── src/
│           │   │   │   │   ├── __init__.py
│           │   │   │   │   ├── modeling_highway_bert.py
│           │   │   │   │   └── modeling_highway_roberta.py
│           │   │   │   ├── test_glue_deebert.py
│           │   │   │   └── train_deebert.sh
│           │   │   ├── distillation/
│           │   │   │   ├── README.md
│           │   │   │   ├── distiller.py
│           │   │   │   ├── grouped_batch_sampler.py
│           │   │   │   ├── lm_seqs_dataset.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_squad_w_distillation.py
│           │   │   │   ├── scripts/
│           │   │   │   │   ├── binarized_data.py
│           │   │   │   │   ├── extract.py
│           │   │   │   │   ├── extract_distilbert.py
│           │   │   │   │   └── token_counts.py
│           │   │   │   ├── train.py
│           │   │   │   ├── training_configs/
│           │   │   │   │   ├── distilbert-base-cased.json
│           │   │   │   │   ├── distilbert-base-multilingual-cased.json
│           │   │   │   │   ├── distilbert-base-uncased.json
│           │   │   │   │   ├── distilgpt2.json
│           │   │   │   │   └── distilroberta-base.json
│           │   │   │   └── utils.py
│           │   │   ├── fsner/
│           │   │   │   ├── README.md
│           │   │   │   ├── pyproject.toml
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── setup.py
│           │   │   │   └── src/
│           │   │   │       └── fsner/
│           │   │   │           ├── __init__.py
│           │   │   │           ├── model.py
│           │   │   │           └── tokenizer_utils.py
│           │   │   ├── information-gain-filtration/
│           │   │   │   ├── README.md
│           │   │   │   ├── igf/
│           │   │   │   │   ├── __init__.py
│           │   │   │   │   └── igf.py
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_clm_igf.py
│           │   │   ├── jax-projects/
│           │   │   │   ├── HOW_TO_PROPOSE_PROJECT.md
│           │   │   │   ├── README.md
│           │   │   │   ├── big_bird/
│           │   │   │   │   ├── README.md
│           │   │   │   │   ├── bigbird_flax.py
│           │   │   │   │   ├── evaluate.py
│           │   │   │   │   ├── prepare_natural_questions.py
│           │   │   │   │   ├── requirements.txt
│           │   │   │   │   ├── sweep_flax.yaml
│           │   │   │   │   └── train.py
│           │   │   │   ├── dataset-streaming/
│           │   │   │   │   ├── README.md
│           │   │   │   │   └── run_mlm_flax_stream.py
│           │   │   │   ├── hybrid_clip/
│           │   │   │   │   ├── README.md
│           │   │   │   │   ├── configuration_hybrid_clip.py
│           │   │   │   │   ├── modeling_hybrid_clip.py
│           │   │   │   │   ├── requirements.txt
│           │   │   │   │   └── run_hybrid_clip.py
│           │   │   │   ├── model_parallel/
│           │   │   │   │   ├── README.md
│           │   │   │   │   ├── partitions.py
│           │   │   │   │   └── run_clm_mp.py
│           │   │   │   └── wav2vec2/
│           │   │   │       ├── README.md
│           │   │   │       └── run_wav2vec2_pretrain_flax.py
│           │   │   ├── layoutlmv3/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_funsd_cord.py
│           │   │   ├── longform-qa/
│           │   │   │   ├── README.md
│           │   │   │   ├── eli5_app.py
│           │   │   │   ├── eli5_utils.py
│           │   │   │   └── requirements.txt
│           │   │   ├── luke/
│           │   │   │   ├── README.md
│           │   │   │   ├── luke_utils.py
│           │   │   │   └── run_luke_ner_no_trainer.py
│           │   │   ├── lxmert/
│           │   │   │   ├── README.md
│           │   │   │   ├── demo.ipynb
│           │   │   │   ├── extracting_data.py
│           │   │   │   ├── modeling_frcnn.py
│           │   │   │   ├── processing_image.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── utils.py
│           │   │   │   └── visualizing_image.py
│           │   │   ├── mlm_wwm/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_chinese_ref.py
│           │   │   │   └── run_mlm_wwm.py
│           │   │   ├── mm-imdb/
│           │   │   │   ├── README.md
│           │   │   │   ├── run_mmimdb.py
│           │   │   │   └── utils_mmimdb.py
│           │   │   ├── movement-pruning/
│           │   │   │   ├── README.md
│           │   │   │   ├── Saving_PruneBERT.ipynb
│           │   │   │   ├── bertarize.py
│           │   │   │   ├── counts_parameters.py
│           │   │   │   ├── emmental/
│           │   │   │   │   ├── __init__.py
│           │   │   │   │   ├── configuration_bert_masked.py
│           │   │   │   │   ├── modeling_bert_masked.py
│           │   │   │   │   └── modules/
│           │   │   │   │       ├── __init__.py
│           │   │   │   │       ├── binarizer.py
│           │   │   │   │       └── masked_nn.py
│           │   │   │   ├── masked_run_glue.py
│           │   │   │   ├── masked_run_squad.py
│           │   │   │   └── requirements.txt
│           │   │   ├── onnx/
│           │   │   │   └── summarization/
│           │   │   │       ├── README.md
│           │   │   │       ├── bart_onnx/
│           │   │   │       │   ├── generation_onnx.py
│           │   │   │       │   └── reduce_onnx_size.py
│           │   │   │       ├── requirements.txt
│           │   │   │       └── run_onnx_exporter.py
│           │   │   ├── performer/
│           │   │   │   ├── README.md
│           │   │   │   ├── full_script.sh
│           │   │   │   ├── modeling_flax_performer.py
│           │   │   │   ├── modeling_flax_performer_utils.py
│           │   │   │   ├── run_mlm_performer.py
│           │   │   │   └── sanity_script.sh
│           │   │   ├── pplm/
│           │   │   │   ├── README.md
│           │   │   │   ├── pplm_classification_head.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_pplm.py
│           │   │   │   └── run_pplm_discrim_train.py
│           │   │   ├── quantization-qdqbert/
│           │   │   │   ├── Dockerfile
│           │   │   │   ├── README.md
│           │   │   │   ├── evaluate-hf-trt-qa.py
│           │   │   │   ├── ort-infer-benchmark.py
│           │   │   │   ├── quant_trainer.py
│           │   │   │   ├── run_quant_qa.py
│           │   │   │   ├── trainer_quant_qa.py
│           │   │   │   └── utils_qa.py
│           │   │   ├── rag/
│           │   │   │   ├── README.md
│           │   │   │   ├── __init__.py
│           │   │   │   ├── _test_finetune_rag.py
│           │   │   │   ├── callbacks_rag.py
│           │   │   │   ├── consolidate_rag_checkpoint.py
│           │   │   │   ├── distributed_pytorch_retriever.py
│           │   │   │   ├── distributed_ray_retriever.py
│           │   │   │   ├── eval_rag.py
│           │   │   │   ├── finetune_rag.py
│           │   │   │   ├── finetune_rag.sh
│           │   │   │   ├── finetune_rag_ray.sh
│           │   │   │   ├── lightning_base.py
│           │   │   │   ├── parse_dpr_relevance_data.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── test_data/
│           │   │   │   │   └── my_knowledge_dataset.csv
│           │   │   │   ├── test_distributed_retriever.py
│           │   │   │   ├── use_own_knowledge_dataset.py
│           │   │   │   └── utils_rag.py
│           │   │   ├── rag-end2end-retriever/
│           │   │   │   ├── README.md
│           │   │   │   ├── callbacks_rag.py
│           │   │   │   ├── distributed_ray_retriever.py
│           │   │   │   ├── eval_rag.py
│           │   │   │   ├── finetune_rag.py
│           │   │   │   ├── finetune_rag_ray_end2end.sh
│           │   │   │   ├── kb_encode_utils.py
│           │   │   │   ├── lightning_base.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── test_run/
│           │   │   │   │   ├── dummy-kb/
│           │   │   │   │   │   └── my_knowledge_dataset.csv
│           │   │   │   │   ├── dummy-train-data/
│           │   │   │   │   │   ├── train.source
│           │   │   │   │   │   ├── train.target
│           │   │   │   │   │   ├── val.source
│           │   │   │   │   │   └── val.target
│           │   │   │   │   ├── test_finetune.sh
│           │   │   │   │   └── test_rag_new_features.sh
│           │   │   │   ├── use_own_knowledge_dataset.py
│           │   │   │   └── utils_rag.py
│           │   │   ├── robust-speech-event/
│           │   │   │   ├── README.md
│           │   │   │   ├── eval.py
│           │   │   │   ├── run_speech_recognition_ctc_bnb.py
│           │   │   │   └── run_speech_recognition_ctc_streaming.py
│           │   │   ├── self-training-text-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── finetuning.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run.sh
│           │   │   │   └── selftraining.py
│           │   │   ├── seq2seq-distillation/
│           │   │   │   ├── README.md
│           │   │   │   ├── _test_bash_script.py
│           │   │   │   ├── _test_make_student.py
│           │   │   │   ├── _test_seq2seq_examples.py
│           │   │   │   ├── _test_seq2seq_examples_multi_gpu.py
│           │   │   │   ├── callbacks.py
│           │   │   │   ├── convert_pl_checkpoint_to_hf.py
│           │   │   │   ├── distil_marian_enro_teacher.sh
│           │   │   │   ├── distil_marian_no_teacher.sh
│           │   │   │   ├── distillation.py
│           │   │   │   ├── dynamic_bs_example.sh
│           │   │   │   ├── finetune.py
│           │   │   │   ├── finetune.sh
│           │   │   │   ├── finetune_bart_tiny.sh
│           │   │   │   ├── finetune_pegasus_xsum.sh
│           │   │   │   ├── finetune_t5.sh
│           │   │   │   ├── lightning_base.py
│           │   │   │   ├── make_student.py
│           │   │   │   ├── precomputed_pseudo_labels.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_eval.py
│           │   │   │   ├── sentence_splitter.py
│           │   │   │   ├── train_distilbart_cnn.sh
│           │   │   │   ├── train_distilbart_xsum.sh
│           │   │   │   ├── train_mbart_cc25_enro.sh
│           │   │   │   └── utils.py
│           │   │   ├── tapex/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_tabfact_with_tapex.py
│           │   │   │   ├── run_wikisql_with_tapex.py
│           │   │   │   ├── run_wikitablequestions_with_tapex.py
│           │   │   │   └── wikisql_utils.py
│           │   │   ├── visual_bert/
│           │   │   │   ├── README.md
│           │   │   │   ├── demo.ipynb
│           │   │   │   ├── extracting_data.py
│           │   │   │   ├── modeling_frcnn.py
│           │   │   │   ├── processing_image.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── utils.py
│           │   │   │   └── visualizing_image.py
│           │   │   ├── wav2vec2/
│           │   │   │   ├── FINE_TUNE_XLSR_WAV2VEC2.md
│           │   │   │   ├── README.md
│           │   │   │   ├── alignment.py
│           │   │   │   ├── ds_config_wav2vec2_zero2.json
│           │   │   │   ├── ds_config_wav2vec2_zero3.json
│           │   │   │   ├── finetune_base_100.sh
│           │   │   │   ├── finetune_base_timit_asr.sh
│           │   │   │   ├── finetune_large_lv60_100.sh
│           │   │   │   ├── finetune_large_lv60_timit_asr.sh
│           │   │   │   ├── finetune_large_xlsr_53_arabic_speech_corpus.sh
│           │   │   │   ├── finetune_wav2vec2_xlsr_turkish.sh
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_alignment.sh
│           │   │   │   ├── run_asr.py
│           │   │   │   ├── run_common_voice.py
│           │   │   │   ├── run_pretrain.py
│           │   │   │   ├── test_wav2vec2_deepspeed.py
│           │   │   │   └── vocab/
│           │   │   │       └── buckwalter.json
│           │   │   ├── xtreme-s/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_xtreme_s.py
│           │   │   └── zero-shot-distillation/
│           │   │       ├── README.md
│           │   │       └── distill_classifier.py
│           │   └── tensorflow/
│           │       ├── README.md
│           │       ├── _tests_requirements.txt
│           │       ├── benchmarking/
│           │       │   ├── README.md
│           │       │   ├── plot_csv_file.py
│           │       │   ├── requirements.txt
│           │       │   └── run_benchmark_tf.py
│           │       ├── language-modeling/
│           │       │   ├── README.md
│           │       │   ├── requirements.txt
│           │       │   ├── run_clm.py
│           │       │   └── run_mlm.py
│           │       ├── multiple-choice/
│           │       │   ├── README.md
│           │       │   ├── requirements.txt
│           │       │   └── run_swag.py
│           │       ├── question-answering/
│           │       │   ├── README.md
│           │       │   ├── requirements.txt
│           │       │   ├── run_qa.py
│           │       │   └── utils_qa.py
│           │       ├── summarization/
│           │       │   ├── README.md
│           │       │   ├── requirements.txt
│           │       │   └── run_summarization.py
│           │       ├── test_tensorflow_examples.py
│           │       ├── text-classification/
│           │       │   ├── README.md
│           │       │   ├── requirements.txt
│           │       │   ├── run_glue.py
│           │       │   └── run_text_classification.py
│           │       ├── token-classification/
│           │       │   ├── README.md
│           │       │   ├── requirements.txt
│           │       │   └── run_ner.py
│           │       └── translation/
│           │           ├── README.md
│           │           ├── requirements.txt
│           │           └── run_translation.py
│           ├── hubconf.py
│           ├── model_cards/
│           │   └── README.md
│           ├── notebooks/
│           │   └── README.md
│           ├── pyproject.toml
│           ├── scripts/
│           │   ├── benchmark/
│           │   │   └── trainer-benchmark.py
│           │   ├── check_tokenizers.py
│           │   ├── distributed/
│           │   │   └── torch-distributed-gpu-test.py
│           │   ├── fsmt/
│           │   │   ├── convert-allenai-wmt16.sh
│           │   │   ├── convert-allenai-wmt19.sh
│           │   │   ├── convert-facebook-wmt19.sh
│           │   │   ├── eval-allenai-wmt16.sh
│           │   │   ├── eval-allenai-wmt19.sh
│           │   │   ├── eval-facebook-wmt19.sh
│           │   │   ├── fsmt-make-super-tiny-model.py
│           │   │   ├── fsmt-make-tiny-model.py
│           │   │   ├── gen-card-allenai-wmt16.py
│           │   │   ├── gen-card-allenai-wmt19.py
│           │   │   ├── gen-card-facebook-wmt19.py
│           │   │   ├── s3-move.sh
│           │   │   └── tests-to-run.sh
│           │   ├── pegasus/
│           │   │   └── build_test_sample_spm_no_bos.py
│           │   ├── stale.py
│           │   └── tatoeba/
│           │       ├── README.md
│           │       └── upload_models.sh
│           ├── setup.cfg
│           ├── setup.py
│           ├── src/
│           │   └── transformers/
│           │       ├── __init__.py
│           │       ├── activations.py
│           │       ├── activations_tf.py
│           │       ├── benchmark/
│           │       │   ├── __init__.py
│           │       │   ├── benchmark.py
│           │       │   ├── benchmark_args.py
│           │       │   ├── benchmark_args_tf.py
│           │       │   ├── benchmark_args_utils.py
│           │       │   ├── benchmark_tf.py
│           │       │   └── benchmark_utils.py
│           │       ├── commands/
│           │       │   ├── __init__.py
│           │       │   ├── add_new_model.py
│           │       │   ├── add_new_model_like.py
│           │       │   ├── convert.py
│           │       │   ├── download.py
│           │       │   ├── env.py
│           │       │   ├── lfs.py
│           │       │   ├── pt_to_tf.py
│           │       │   ├── run.py
│           │       │   ├── serving.py
│           │       │   ├── train.py
│           │       │   ├── transformers_cli.py
│           │       │   └── user.py
│           │       ├── configuration_utils.py
│           │       ├── convert_graph_to_onnx.py
│           │       ├── convert_pytorch_checkpoint_to_tf2.py
│           │       ├── convert_slow_tokenizer.py
│           │       ├── convert_slow_tokenizers_checkpoints_to_fast.py
│           │       ├── convert_tf_hub_seq_to_seq_bert_to_pytorch.py
│           │       ├── data/
│           │       │   ├── __init__.py
│           │       │   ├── data_collator.py
│           │       │   ├── datasets/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── glue.py
│           │       │   │   ├── language_modeling.py
│           │       │   │   └── squad.py
│           │       │   ├── metrics/
│           │       │   │   ├── __init__.py
│           │       │   │   └── squad_metrics.py
│           │       │   ├── processors/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── glue.py
│           │       │   │   ├── squad.py
│           │       │   │   ├── utils.py
│           │       │   │   └── xnli.py
│           │       │   └── test_generation_utils.py
│           │       ├── debug_utils.py
│           │       ├── deepspeed.py
│           │       ├── dependency_versions_check.py
│           │       ├── dependency_versions_table.py
│           │       ├── dynamic_module_utils.py
│           │       ├── feature_extraction_sequence_utils.py
│           │       ├── feature_extraction_utils.py
│           │       ├── file_utils.py
│           │       ├── generation_beam_constraints.py
│           │       ├── generation_beam_search.py
│           │       ├── generation_flax_logits_process.py
│           │       ├── generation_flax_utils.py
│           │       ├── generation_logits_process.py
│           │       ├── generation_stopping_criteria.py
│           │       ├── generation_tf_logits_process.py
│           │       ├── generation_tf_utils.py
│           │       ├── generation_utils.py
│           │       ├── hf_argparser.py
│           │       ├── image_processing_utils.py
│           │       ├── image_transforms.py
│           │       ├── image_utils.py
│           │       ├── integrations.py
│           │       ├── keras_callbacks.py
│           │       ├── modelcard.py
│           │       ├── modeling_flax_outputs.py
│           │       ├── modeling_flax_pytorch_utils.py
│           │       ├── modeling_flax_utils.py
│           │       ├── modeling_outputs.py
│           │       ├── modeling_tf_outputs.py
│           │       ├── modeling_tf_pytorch_utils.py
│           │       ├── modeling_tf_utils.py
│           │       ├── modeling_utils.py
│           │       ├── models/
│           │       │   ├── __init__.py
│           │       │   ├── albert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_albert.py
│           │       │   │   ├── convert_albert_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_albert.py
│           │       │   │   ├── modeling_flax_albert.py
│           │       │   │   ├── modeling_tf_albert.py
│           │       │   │   ├── tokenization_albert.py
│           │       │   │   └── tokenization_albert_fast.py
│           │       │   ├── auto/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── auto_factory.py
│           │       │   │   ├── configuration_auto.py
│           │       │   │   ├── feature_extraction_auto.py
│           │       │   │   ├── modeling_auto.py
│           │       │   │   ├── modeling_flax_auto.py
│           │       │   │   ├── modeling_tf_auto.py
│           │       │   │   ├── processing_auto.py
│           │       │   │   └── tokenization_auto.py
│           │       │   ├── bart/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_bart.py
│           │       │   │   ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_bart.py
│           │       │   │   ├── modeling_flax_bart.py
│           │       │   │   ├── modeling_tf_bart.py
│           │       │   │   ├── tokenization_bart.py
│           │       │   │   └── tokenization_bart_fast.py
│           │       │   ├── barthez/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── tokenization_barthez.py
│           │       │   │   └── tokenization_barthez_fast.py
│           │       │   ├── bartpho/
│           │       │   │   ├── __init__.py
│           │       │   │   └── tokenization_bartpho.py
│           │       │   ├── beit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_beit.py
│           │       │   │   ├── convert_beit_unilm_to_pytorch.py
│           │       │   │   ├── feature_extraction_beit.py
│           │       │   │   ├── modeling_beit.py
│           │       │   │   └── modeling_flax_beit.py
│           │       │   ├── bert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_bert.py
│           │       │   │   ├── convert_bert_original_tf2_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_bert_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_bert_pytorch_checkpoint_to_original_tf.py
│           │       │   │   ├── convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_bert.py
│           │       │   │   ├── modeling_flax_bert.py
│           │       │   │   ├── modeling_tf_bert.py
│           │       │   │   ├── tokenization_bert.py
│           │       │   │   ├── tokenization_bert_fast.py
│           │       │   │   └── tokenization_bert_tf.py
│           │       │   ├── bert_generation/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_bert_generation.py
│           │       │   │   ├── modeling_bert_generation.py
│           │       │   │   └── tokenization_bert_generation.py
│           │       │   ├── bert_japanese/
│           │       │   │   ├── __init__.py
│           │       │   │   └── tokenization_bert_japanese.py
│           │       │   ├── bertweet/
│           │       │   │   ├── __init__.py
│           │       │   │   └── tokenization_bertweet.py
│           │       │   ├── big_bird/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_big_bird.py
│           │       │   │   ├── convert_bigbird_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_big_bird.py
│           │       │   │   ├── modeling_flax_big_bird.py
│           │       │   │   ├── tokenization_big_bird.py
│           │       │   │   └── tokenization_big_bird_fast.py
│           │       │   ├── bigbird_pegasus/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_bigbird_pegasus.py
│           │       │   │   ├── convert_bigbird_pegasus_tf_to_pytorch.py
│           │       │   │   └── modeling_bigbird_pegasus.py
│           │       │   ├── blenderbot/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_blenderbot.py
│           │       │   │   ├── convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_blenderbot.py
│           │       │   │   ├── modeling_flax_blenderbot.py
│           │       │   │   ├── modeling_tf_blenderbot.py
│           │       │   │   ├── tokenization_blenderbot.py
│           │       │   │   └── tokenization_blenderbot_fast.py
│           │       │   ├── blenderbot_small/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_blenderbot_small.py
│           │       │   │   ├── modeling_blenderbot_small.py
│           │       │   │   ├── modeling_flax_blenderbot_small.py
│           │       │   │   ├── modeling_tf_blenderbot_small.py
│           │       │   │   ├── tokenization_blenderbot_small.py
│           │       │   │   └── tokenization_blenderbot_small_fast.py
│           │       │   ├── bloom/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_bloom.py
│           │       │   │   ├── convert_bloom_original_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_bloom.py
│           │       │   │   └── tokenization_bloom_fast.py
│           │       │   ├── bort/
│           │       │   │   ├── __init__.py
│           │       │   │   └── convert_bort_original_gluonnlp_checkpoint_to_pytorch.py
│           │       │   ├── byt5/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── convert_byt5_original_tf_checkpoint_to_pytorch.py
│           │       │   │   └── tokenization_byt5.py
│           │       │   ├── camembert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_camembert.py
│           │       │   │   ├── modeling_camembert.py
│           │       │   │   ├── modeling_tf_camembert.py
│           │       │   │   ├── tokenization_camembert.py
│           │       │   │   └── tokenization_camembert_fast.py
│           │       │   ├── canine/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_canine.py
│           │       │   │   ├── convert_canine_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_canine.py
│           │       │   │   └── tokenization_canine.py
│           │       │   ├── clip/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_clip.py
│           │       │   │   ├── convert_clip_original_pytorch_to_hf.py
│           │       │   │   ├── feature_extraction_clip.py
│           │       │   │   ├── modeling_clip.py
│           │       │   │   ├── modeling_flax_clip.py
│           │       │   │   ├── modeling_tf_clip.py
│           │       │   │   ├── processing_clip.py
│           │       │   │   ├── tokenization_clip.py
│           │       │   │   └── tokenization_clip_fast.py
│           │       │   ├── codegen/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_codegen.py
│           │       │   │   ├── modeling_codegen.py
│           │       │   │   ├── tokenization_codegen.py
│           │       │   │   └── tokenization_codegen_fast.py
│           │       │   ├── conditional_detr/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_conditional_detr.py
│           │       │   │   ├── convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── feature_extraction_conditional_detr.py
│           │       │   │   └── modeling_conditional_detr.py
│           │       │   ├── convbert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_convbert.py
│           │       │   │   ├── convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py
│           │       │   │   ├── modeling_convbert.py
│           │       │   │   ├── modeling_tf_convbert.py
│           │       │   │   ├── tokenization_convbert.py
│           │       │   │   └── tokenization_convbert_fast.py
│           │       │   ├── convnext/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_convnext.py
│           │       │   │   ├── convert_convnext_to_pytorch.py
│           │       │   │   ├── feature_extraction_convnext.py
│           │       │   │   ├── modeling_convnext.py
│           │       │   │   └── modeling_tf_convnext.py
│           │       │   ├── cpm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── tokenization_cpm.py
│           │       │   │   └── tokenization_cpm_fast.py
│           │       │   ├── ctrl/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_ctrl.py
│           │       │   │   ├── modeling_ctrl.py
│           │       │   │   ├── modeling_tf_ctrl.py
│           │       │   │   └── tokenization_ctrl.py
│           │       │   ├── cvt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_cvt.py
│           │       │   │   ├── convert_cvt_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_cvt.py
│           │       │   │   └── modeling_tf_cvt.py
│           │       │   ├── data2vec/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_data2vec_audio.py
│           │       │   │   ├── configuration_data2vec_text.py
│           │       │   │   ├── configuration_data2vec_vision.py
│           │       │   │   ├── convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_data2vec_audio.py
│           │       │   │   ├── modeling_data2vec_text.py
│           │       │   │   ├── modeling_data2vec_vision.py
│           │       │   │   └── modeling_tf_data2vec_vision.py
│           │       │   ├── deberta/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_deberta.py
│           │       │   │   ├── modeling_deberta.py
│           │       │   │   ├── modeling_tf_deberta.py
│           │       │   │   ├── tokenization_deberta.py
│           │       │   │   └── tokenization_deberta_fast.py
│           │       │   ├── deberta_v2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_deberta_v2.py
│           │       │   │   ├── modeling_deberta_v2.py
│           │       │   │   ├── modeling_tf_deberta_v2.py
│           │       │   │   ├── tokenization_deberta_v2.py
│           │       │   │   └── tokenization_deberta_v2_fast.py
│           │       │   ├── decision_transformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_decision_transformer.py
│           │       │   │   └── modeling_decision_transformer.py
│           │       │   ├── deformable_detr/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_deformable_detr.py
│           │       │   │   ├── convert_deformable_detr_to_pytorch.py
│           │       │   │   ├── custom_kernel/
│           │       │   │   │   ├── cpu/
│           │       │   │   │   │   ├── ms_deform_attn_cpu.cpp
│           │       │   │   │   │   └── ms_deform_attn_cpu.h
│           │       │   │   │   ├── cuda/
│           │       │   │   │   │   ├── ms_deform_attn_cuda.cu
│           │       │   │   │   │   ├── ms_deform_attn_cuda.cuh
│           │       │   │   │   │   ├── ms_deform_attn_cuda.h
│           │       │   │   │   │   └── ms_deform_im2col_cuda.cuh
│           │       │   │   │   ├── ms_deform_attn.h
│           │       │   │   │   └── vision.cpp
│           │       │   │   ├── feature_extraction_deformable_detr.py
│           │       │   │   ├── load_custom.py
│           │       │   │   └── modeling_deformable_detr.py
│           │       │   ├── deit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_deit.py
│           │       │   │   ├── convert_deit_timm_to_pytorch.py
│           │       │   │   ├── feature_extraction_deit.py
│           │       │   │   ├── modeling_deit.py
│           │       │   │   └── modeling_tf_deit.py
│           │       │   ├── detr/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_detr.py
│           │       │   │   ├── convert_detr_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── feature_extraction_detr.py
│           │       │   │   └── modeling_detr.py
│           │       │   ├── dialogpt/
│           │       │   │   ├── __init__.py
│           │       │   │   └── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
│           │       │   ├── distilbert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_distilbert.py
│           │       │   │   ├── modeling_distilbert.py
│           │       │   │   ├── modeling_flax_distilbert.py
│           │       │   │   ├── modeling_tf_distilbert.py
│           │       │   │   ├── tokenization_distilbert.py
│           │       │   │   └── tokenization_distilbert_fast.py
│           │       │   ├── dit/
│           │       │   │   ├── __init__.py
│           │       │   │   └── convert_dit_unilm_to_pytorch.py
│           │       │   ├── donut/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_donut_swin.py
│           │       │   │   ├── convert_donut_to_pytorch.py
│           │       │   │   ├── feature_extraction_donut.py
│           │       │   │   ├── modeling_donut_swin.py
│           │       │   │   └── processing_donut.py
│           │       │   ├── dpr/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_dpr.py
│           │       │   │   ├── convert_dpr_original_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_dpr.py
│           │       │   │   ├── modeling_tf_dpr.py
│           │       │   │   ├── tokenization_dpr.py
│           │       │   │   └── tokenization_dpr_fast.py
│           │       │   ├── dpt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_dpt.py
│           │       │   │   ├── convert_dpt_to_pytorch.py
│           │       │   │   ├── feature_extraction_dpt.py
│           │       │   │   └── modeling_dpt.py
│           │       │   ├── electra/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_electra.py
│           │       │   │   ├── convert_electra_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_electra.py
│           │       │   │   ├── modeling_flax_electra.py
│           │       │   │   ├── modeling_tf_electra.py
│           │       │   │   ├── tokenization_electra.py
│           │       │   │   └── tokenization_electra_fast.py
│           │       │   ├── encoder_decoder/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_encoder_decoder.py
│           │       │   │   ├── modeling_encoder_decoder.py
│           │       │   │   ├── modeling_flax_encoder_decoder.py
│           │       │   │   └── modeling_tf_encoder_decoder.py
│           │       │   ├── ernie/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_ernie.py
│           │       │   │   └── modeling_ernie.py
│           │       │   ├── esm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_esm.py
│           │       │   │   ├── convert_esm.py
│           │       │   │   ├── modeling_esm.py
│           │       │   │   ├── modeling_esmfold.py
│           │       │   │   ├── modeling_tf_esm.py
│           │       │   │   ├── openfold_utils/
│           │       │   │   │   ├── __init__.py
│           │       │   │   │   ├── chunk_utils.py
│           │       │   │   │   ├── data_transforms.py
│           │       │   │   │   ├── feats.py
│           │       │   │   │   ├── loss.py
│           │       │   │   │   ├── protein.py
│           │       │   │   │   ├── residue_constants.py
│           │       │   │   │   ├── rigid_utils.py
│           │       │   │   │   └── tensor_utils.py
│           │       │   │   └── tokenization_esm.py
│           │       │   ├── flaubert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_flaubert.py
│           │       │   │   ├── modeling_flaubert.py
│           │       │   │   ├── modeling_tf_flaubert.py
│           │       │   │   └── tokenization_flaubert.py
│           │       │   ├── flava/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_flava.py
│           │       │   │   ├── convert_dalle_to_flava_codebook.py
│           │       │   │   ├── convert_flava_original_pytorch_to_hf.py
│           │       │   │   ├── feature_extraction_flava.py
│           │       │   │   ├── modeling_flava.py
│           │       │   │   └── processing_flava.py
│           │       │   ├── fnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_fnet.py
│           │       │   │   ├── convert_fnet_original_flax_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_fnet.py
│           │       │   │   ├── tokenization_fnet.py
│           │       │   │   └── tokenization_fnet_fast.py
│           │       │   ├── fsmt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_fsmt.py
│           │       │   │   ├── convert_fsmt_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_fsmt.py
│           │       │   │   └── tokenization_fsmt.py
│           │       │   ├── funnel/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_funnel.py
│           │       │   │   ├── convert_funnel_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_funnel.py
│           │       │   │   ├── modeling_tf_funnel.py
│           │       │   │   ├── tokenization_funnel.py
│           │       │   │   └── tokenization_funnel_fast.py
│           │       │   ├── glpn/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_glpn.py
│           │       │   │   ├── convert_glpn_to_pytorch.py
│           │       │   │   ├── feature_extraction_glpn.py
│           │       │   │   ├── image_processing_glpn.py
│           │       │   │   └── modeling_glpn.py
│           │       │   ├── gpt2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_gpt2.py
│           │       │   │   ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_flax_gpt2.py
│           │       │   │   ├── modeling_gpt2.py
│           │       │   │   ├── modeling_tf_gpt2.py
│           │       │   │   ├── tokenization_gpt2.py
│           │       │   │   └── tokenization_gpt2_fast.py
│           │       │   ├── gpt_neo/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_gpt_neo.py
│           │       │   │   ├── convert_gpt_neo_mesh_tf_to_pytorch.py
│           │       │   │   ├── modeling_flax_gpt_neo.py
│           │       │   │   └── modeling_gpt_neo.py
│           │       │   ├── gpt_neox/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_gpt_neox.py
│           │       │   │   ├── modeling_gpt_neox.py
│           │       │   │   └── tokenization_gpt_neox_fast.py
│           │       │   ├── gpt_neox_japanese/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_gpt_neox_japanese.py
│           │       │   │   ├── modeling_gpt_neox_japanese.py
│           │       │   │   └── tokenization_gpt_neox_japanese.py
│           │       │   ├── gptj/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_gptj.py
│           │       │   │   ├── modeling_flax_gptj.py
│           │       │   │   ├── modeling_gptj.py
│           │       │   │   └── modeling_tf_gptj.py
│           │       │   ├── groupvit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_groupvit.py
│           │       │   │   ├── convert_groupvit_nvlab_to_hf.py
│           │       │   │   ├── modeling_groupvit.py
│           │       │   │   └── modeling_tf_groupvit.py
│           │       │   ├── herbert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── tokenization_herbert.py
│           │       │   │   └── tokenization_herbert_fast.py
│           │       │   ├── hubert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_hubert.py
│           │       │   │   ├── convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_hubert_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_hubert_original_s3prl_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_hubert.py
│           │       │   │   └── modeling_tf_hubert.py
│           │       │   ├── ibert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_ibert.py
│           │       │   │   ├── modeling_ibert.py
│           │       │   │   └── quant_modules.py
│           │       │   ├── imagegpt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_imagegpt.py
│           │       │   │   ├── convert_imagegpt_original_tf2_to_pytorch.py
│           │       │   │   ├── feature_extraction_imagegpt.py
│           │       │   │   └── modeling_imagegpt.py
│           │       │   ├── layoutlm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_layoutlm.py
│           │       │   │   ├── modeling_layoutlm.py
│           │       │   │   ├── modeling_tf_layoutlm.py
│           │       │   │   ├── tokenization_layoutlm.py
│           │       │   │   └── tokenization_layoutlm_fast.py
│           │       │   ├── layoutlmv2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_layoutlmv2.py
│           │       │   │   ├── feature_extraction_layoutlmv2.py
│           │       │   │   ├── modeling_layoutlmv2.py
│           │       │   │   ├── processing_layoutlmv2.py
│           │       │   │   ├── tokenization_layoutlmv2.py
│           │       │   │   └── tokenization_layoutlmv2_fast.py
│           │       │   ├── layoutlmv3/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_layoutlmv3.py
│           │       │   │   ├── feature_extraction_layoutlmv3.py
│           │       │   │   ├── modeling_layoutlmv3.py
│           │       │   │   ├── modeling_tf_layoutlmv3.py
│           │       │   │   ├── processing_layoutlmv3.py
│           │       │   │   ├── tokenization_layoutlmv3.py
│           │       │   │   └── tokenization_layoutlmv3_fast.py
│           │       │   ├── layoutxlm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── processing_layoutxlm.py
│           │       │   │   ├── tokenization_layoutxlm.py
│           │       │   │   └── tokenization_layoutxlm_fast.py
│           │       │   ├── led/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_led.py
│           │       │   │   ├── modeling_led.py
│           │       │   │   ├── modeling_tf_led.py
│           │       │   │   ├── tokenization_led.py
│           │       │   │   └── tokenization_led_fast.py
│           │       │   ├── levit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_levit.py
│           │       │   │   ├── convert_levit_timm_to_pytorch.py
│           │       │   │   ├── feature_extraction_levit.py
│           │       │   │   └── modeling_levit.py
│           │       │   ├── lilt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_lilt.py
│           │       │   │   └── modeling_lilt.py
│           │       │   ├── longformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_longformer.py
│           │       │   │   ├── convert_longformer_original_pytorch_lightning_to_pytorch.py
│           │       │   │   ├── modeling_longformer.py
│           │       │   │   ├── modeling_tf_longformer.py
│           │       │   │   ├── tokenization_longformer.py
│           │       │   │   └── tokenization_longformer_fast.py
│           │       │   ├── longt5/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_longt5.py
│           │       │   │   ├── convert_longt5x_checkpoint_to_flax.py
│           │       │   │   ├── modeling_flax_longt5.py
│           │       │   │   └── modeling_longt5.py
│           │       │   ├── luke/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_luke.py
│           │       │   │   ├── convert_luke_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_luke.py
│           │       │   │   └── tokenization_luke.py
│           │       │   ├── lxmert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_lxmert.py
│           │       │   │   ├── convert_lxmert_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_lxmert.py
│           │       │   │   ├── modeling_tf_lxmert.py
│           │       │   │   ├── tokenization_lxmert.py
│           │       │   │   └── tokenization_lxmert_fast.py
│           │       │   ├── m2m_100/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_m2m_100.py
│           │       │   │   ├── convert_m2m100_original_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_m2m_100.py
│           │       │   │   └── tokenization_m2m_100.py
│           │       │   ├── marian/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_marian.py
│           │       │   │   ├── convert_marian_tatoeba_to_pytorch.py
│           │       │   │   ├── convert_marian_to_pytorch.py
│           │       │   │   ├── modeling_flax_marian.py
│           │       │   │   ├── modeling_marian.py
│           │       │   │   ├── modeling_tf_marian.py
│           │       │   │   └── tokenization_marian.py
│           │       │   ├── markuplm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_markuplm.py
│           │       │   │   ├── feature_extraction_markuplm.py
│           │       │   │   ├── modeling_markuplm.py
│           │       │   │   ├── processing_markuplm.py
│           │       │   │   ├── tokenization_markuplm.py
│           │       │   │   └── tokenization_markuplm_fast.py
│           │       │   ├── maskformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_maskformer.py
│           │       │   │   ├── convert_maskformer_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── feature_extraction_maskformer.py
│           │       │   │   └── modeling_maskformer.py
│           │       │   ├── mbart/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mbart.py
│           │       │   │   ├── convert_mbart_original_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_flax_mbart.py
│           │       │   │   ├── modeling_mbart.py
│           │       │   │   ├── modeling_tf_mbart.py
│           │       │   │   ├── tokenization_mbart.py
│           │       │   │   └── tokenization_mbart_fast.py
│           │       │   ├── mbart50/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── tokenization_mbart50.py
│           │       │   │   └── tokenization_mbart50_fast.py
│           │       │   ├── mctct/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mctct.py
│           │       │   │   ├── feature_extraction_mctct.py
│           │       │   │   ├── modeling_mctct.py
│           │       │   │   └── processing_mctct.py
│           │       │   ├── megatron_bert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_megatron_bert.py
│           │       │   │   ├── convert_megatron_bert_checkpoint.py
│           │       │   │   └── modeling_megatron_bert.py
│           │       │   ├── megatron_gpt2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── checkpoint_reshaping_and_interoperability.py
│           │       │   │   └── convert_megatron_gpt2_checkpoint.py
│           │       │   ├── mluke/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── convert_mluke_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── tokenization_mluke.py
│           │       │   ├── mmbt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mmbt.py
│           │       │   │   └── modeling_mmbt.py
│           │       │   ├── mobilebert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mobilebert.py
│           │       │   │   ├── convert_mobilebert_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_mobilebert.py
│           │       │   │   ├── modeling_tf_mobilebert.py
│           │       │   │   ├── tokenization_mobilebert.py
│           │       │   │   └── tokenization_mobilebert_fast.py
│           │       │   ├── mobilevit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mobilevit.py
│           │       │   │   ├── convert_mlcvnets_to_pytorch.py
│           │       │   │   ├── feature_extraction_mobilevit.py
│           │       │   │   ├── modeling_mobilevit.py
│           │       │   │   └── modeling_tf_mobilevit.py
│           │       │   ├── mpnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mpnet.py
│           │       │   │   ├── modeling_mpnet.py
│           │       │   │   ├── modeling_tf_mpnet.py
│           │       │   │   ├── tokenization_mpnet.py
│           │       │   │   └── tokenization_mpnet_fast.py
│           │       │   ├── mt5/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mt5.py
│           │       │   │   ├── modeling_flax_mt5.py
│           │       │   │   ├── modeling_mt5.py
│           │       │   │   └── modeling_tf_mt5.py
│           │       │   ├── mvp/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mvp.py
│           │       │   │   ├── modeling_mvp.py
│           │       │   │   ├── tokenization_mvp.py
│           │       │   │   └── tokenization_mvp_fast.py
│           │       │   ├── nezha/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_nezha.py
│           │       │   │   └── modeling_nezha.py
│           │       │   ├── nllb/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── tokenization_nllb.py
│           │       │   │   └── tokenization_nllb_fast.py
│           │       │   ├── nystromformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_nystromformer.py
│           │       │   │   ├── convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_nystromformer.py
│           │       │   ├── openai/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_openai.py
│           │       │   │   ├── convert_openai_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_openai.py
│           │       │   │   ├── modeling_tf_openai.py
│           │       │   │   ├── tokenization_openai.py
│           │       │   │   └── tokenization_openai_fast.py
│           │       │   ├── opt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_opt.py
│           │       │   │   ├── convert_opt_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_flax_opt.py
│           │       │   │   ├── modeling_opt.py
│           │       │   │   └── modeling_tf_opt.py
│           │       │   ├── owlvit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_owlvit.py
│           │       │   │   ├── convert_owlvit_original_flax_to_hf.py
│           │       │   │   ├── feature_extraction_owlvit.py
│           │       │   │   ├── modeling_owlvit.py
│           │       │   │   └── processing_owlvit.py
│           │       │   ├── pegasus/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_pegasus.py
│           │       │   │   ├── convert_pegasus_tf_to_pytorch.py
│           │       │   │   ├── modeling_flax_pegasus.py
│           │       │   │   ├── modeling_pegasus.py
│           │       │   │   ├── modeling_tf_pegasus.py
│           │       │   │   ├── tokenization_pegasus.py
│           │       │   │   └── tokenization_pegasus_fast.py
│           │       │   ├── pegasus_x/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_pegasus_x.py
│           │       │   │   └── modeling_pegasus_x.py
│           │       │   ├── perceiver/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_perceiver.py
│           │       │   │   ├── convert_perceiver_haiku_to_pytorch.py
│           │       │   │   ├── feature_extraction_perceiver.py
│           │       │   │   ├── modeling_perceiver.py
│           │       │   │   └── tokenization_perceiver.py
│           │       │   ├── phobert/
│           │       │   │   ├── __init__.py
│           │       │   │   └── tokenization_phobert.py
│           │       │   ├── plbart/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_plbart.py
│           │       │   │   ├── convert_plbart_original_checkpoint_to_torch.py
│           │       │   │   ├── modeling_plbart.py
│           │       │   │   └── tokenization_plbart.py
│           │       │   ├── poolformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_poolformer.py
│           │       │   │   ├── convert_poolformer_original_to_pytorch.py
│           │       │   │   ├── feature_extraction_poolformer.py
│           │       │   │   └── modeling_poolformer.py
│           │       │   ├── prophetnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_prophetnet.py
│           │       │   │   ├── convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_prophetnet.py
│           │       │   │   └── tokenization_prophetnet.py
│           │       │   ├── qdqbert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_qdqbert.py
│           │       │   │   └── modeling_qdqbert.py
│           │       │   ├── rag/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_rag.py
│           │       │   │   ├── modeling_rag.py
│           │       │   │   ├── modeling_tf_rag.py
│           │       │   │   ├── retrieval_rag.py
│           │       │   │   └── tokenization_rag.py
│           │       │   ├── realm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_realm.py
│           │       │   │   ├── modeling_realm.py
│           │       │   │   ├── retrieval_realm.py
│           │       │   │   ├── tokenization_realm.py
│           │       │   │   └── tokenization_realm_fast.py
│           │       │   ├── reformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_reformer.py
│           │       │   │   ├── convert_reformer_trax_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_reformer.py
│           │       │   │   ├── tokenization_reformer.py
│           │       │   │   └── tokenization_reformer_fast.py
│           │       │   ├── regnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_regnet.py
│           │       │   │   ├── convert_regnet_seer_10b_to_pytorch.py
│           │       │   │   ├── convert_regnet_to_pytorch.py
│           │       │   │   ├── modeling_regnet.py
│           │       │   │   └── modeling_tf_regnet.py
│           │       │   ├── rembert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_rembert.py
│           │       │   │   ├── convert_rembert_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_rembert.py
│           │       │   │   ├── modeling_tf_rembert.py
│           │       │   │   ├── tokenization_rembert.py
│           │       │   │   └── tokenization_rembert_fast.py
│           │       │   ├── resnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_resnet.py
│           │       │   │   ├── convert_resnet_to_pytorch.py
│           │       │   │   ├── modeling_resnet.py
│           │       │   │   └── modeling_tf_resnet.py
│           │       │   ├── retribert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_retribert.py
│           │       │   │   ├── modeling_retribert.py
│           │       │   │   ├── tokenization_retribert.py
│           │       │   │   └── tokenization_retribert_fast.py
│           │       │   ├── roberta/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_roberta.py
│           │       │   │   ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_flax_roberta.py
│           │       │   │   ├── modeling_roberta.py
│           │       │   │   ├── modeling_tf_roberta.py
│           │       │   │   ├── tokenization_roberta.py
│           │       │   │   └── tokenization_roberta_fast.py
│           │       │   ├── roformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_roformer.py
│           │       │   │   ├── convert_roformer_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_flax_roformer.py
│           │       │   │   ├── modeling_roformer.py
│           │       │   │   ├── modeling_tf_roformer.py
│           │       │   │   ├── tokenization_roformer.py
│           │       │   │   ├── tokenization_roformer_fast.py
│           │       │   │   └── tokenization_utils.py
│           │       │   ├── segformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_segformer.py
│           │       │   │   ├── convert_segformer_original_to_pytorch.py
│           │       │   │   ├── feature_extraction_segformer.py
│           │       │   │   ├── modeling_segformer.py
│           │       │   │   └── modeling_tf_segformer.py
│           │       │   ├── sew/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_sew.py
│           │       │   │   ├── convert_sew_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_sew.py
│           │       │   ├── sew_d/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_sew_d.py
│           │       │   │   ├── convert_sew_d_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_sew_d.py
│           │       │   ├── speech_encoder_decoder/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_speech_encoder_decoder.py
│           │       │   │   ├── convert_mbart_wav2vec2_seq2seq_original_to_pytorch.py
│           │       │   │   ├── convert_speech_to_text_wav2vec2_seq2seq_original_to_pytorch.py
│           │       │   │   ├── modeling_flax_speech_encoder_decoder.py
│           │       │   │   └── modeling_speech_encoder_decoder.py
│           │       │   ├── speech_to_text/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_speech_to_text.py
│           │       │   │   ├── convert_s2t_fairseq_to_tfms.py
│           │       │   │   ├── feature_extraction_speech_to_text.py
│           │       │   │   ├── modeling_speech_to_text.py
│           │       │   │   ├── modeling_tf_speech_to_text.py
│           │       │   │   ├── processing_speech_to_text.py
│           │       │   │   └── tokenization_speech_to_text.py
│           │       │   ├── speech_to_text_2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_speech_to_text_2.py
│           │       │   │   ├── modeling_speech_to_text_2.py
│           │       │   │   ├── processing_speech_to_text_2.py
│           │       │   │   └── tokenization_speech_to_text_2.py
│           │       │   ├── splinter/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_splinter.py
│           │       │   │   ├── modeling_splinter.py
│           │       │   │   ├── tokenization_splinter.py
│           │       │   │   └── tokenization_splinter_fast.py
│           │       │   ├── squeezebert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_squeezebert.py
│           │       │   │   ├── modeling_squeezebert.py
│           │       │   │   ├── tokenization_squeezebert.py
│           │       │   │   └── tokenization_squeezebert_fast.py
│           │       │   ├── swin/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_swin.py
│           │       │   │   ├── convert_swin_timm_to_pytorch.py
│           │       │   │   ├── modeling_swin.py
│           │       │   │   └── modeling_tf_swin.py
│           │       │   ├── swinv2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_swinv2.py
│           │       │   │   ├── convert_swinv2_timm_to_pytorch.py
│           │       │   │   └── modeling_swinv2.py
│           │       │   ├── t5/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_t5.py
│           │       │   │   ├── convert_t5_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_t5x_checkpoint_to_flax.py
│           │       │   │   ├── download_from_gcp.sh
│           │       │   │   ├── modeling_flax_t5.py
│           │       │   │   ├── modeling_t5.py
│           │       │   │   ├── modeling_tf_t5.py
│           │       │   │   ├── tokenization_t5.py
│           │       │   │   └── tokenization_t5_fast.py
│           │       │   ├── table_transformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_table_transformer.py
│           │       │   │   ├── convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_table_transformer.py
│           │       │   ├── tapas/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_tapas.py
│           │       │   │   ├── convert_tapas_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_tapas.py
│           │       │   │   ├── modeling_tf_tapas.py
│           │       │   │   └── tokenization_tapas.py
│           │       │   ├── tapex/
│           │       │   │   ├── __init__.py
│           │       │   │   └── tokenization_tapex.py
│           │       │   ├── time_series_transformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_time_series_transformer.py
│           │       │   │   └── modeling_time_series_transformer.py
│           │       │   ├── trajectory_transformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_trajectory_transformer.py
│           │       │   │   ├── convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_trajectory_transformer.py
│           │       │   ├── transfo_xl/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_transfo_xl.py
│           │       │   │   ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_tf_transfo_xl.py
│           │       │   │   ├── modeling_tf_transfo_xl_utilities.py
│           │       │   │   ├── modeling_transfo_xl.py
│           │       │   │   ├── modeling_transfo_xl_utilities.py
│           │       │   │   └── tokenization_transfo_xl.py
│           │       │   ├── trocr/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_trocr.py
│           │       │   │   ├── convert_trocr_unilm_to_pytorch.py
│           │       │   │   ├── modeling_trocr.py
│           │       │   │   └── processing_trocr.py
│           │       │   ├── unispeech/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_unispeech.py
│           │       │   │   ├── convert_unispeech_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_unispeech.py
│           │       │   ├── unispeech_sat/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_unispeech_sat.py
│           │       │   │   ├── convert_unispeech_original_s3prl_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_unispeech_sat.py
│           │       │   ├── van/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_van.py
│           │       │   │   ├── convert_van_to_pytorch.py
│           │       │   │   └── modeling_van.py
│           │       │   ├── videomae/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_videomae.py
│           │       │   │   ├── convert_videomae_to_pytorch.py
│           │       │   │   ├── feature_extraction_videomae.py
│           │       │   │   └── modeling_videomae.py
│           │       │   ├── vilt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_vilt.py
│           │       │   │   ├── convert_vilt_original_to_pytorch.py
│           │       │   │   ├── feature_extraction_vilt.py
│           │       │   │   ├── modeling_vilt.py
│           │       │   │   └── processing_vilt.py
│           │       │   ├── vision_encoder_decoder/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_vision_encoder_decoder.py
│           │       │   │   ├── modeling_flax_vision_encoder_decoder.py
│           │       │   │   ├── modeling_tf_vision_encoder_decoder.py
│           │       │   │   └── modeling_vision_encoder_decoder.py
│           │       │   ├── vision_text_dual_encoder/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_vision_text_dual_encoder.py
│           │       │   │   ├── modeling_flax_vision_text_dual_encoder.py
│           │       │   │   ├── modeling_vision_text_dual_encoder.py
│           │       │   │   └── processing_vision_text_dual_encoder.py
│           │       │   ├── visual_bert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_visual_bert.py
│           │       │   │   ├── convert_visual_bert_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_visual_bert.py
│           │       │   ├── vit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_vit.py
│           │       │   │   ├── convert_dino_to_pytorch.py
│           │       │   │   ├── convert_vit_timm_to_pytorch.py
│           │       │   │   ├── feature_extraction_vit.py
│           │       │   │   ├── modeling_flax_vit.py
│           │       │   │   ├── modeling_tf_vit.py
│           │       │   │   └── modeling_vit.py
│           │       │   ├── vit_mae/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_vit_mae.py
│           │       │   │   ├── convert_vit_mae_to_pytorch.py
│           │       │   │   ├── modeling_tf_vit_mae.py
│           │       │   │   └── modeling_vit_mae.py
│           │       │   ├── vit_msn/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_vit_msn.py
│           │       │   │   ├── convert_msn_to_pytorch.py
│           │       │   │   └── modeling_vit_msn.py
│           │       │   ├── wav2vec2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_wav2vec2.py
│           │       │   │   ├── convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py
│           │       │   │   ├── feature_extraction_wav2vec2.py
│           │       │   │   ├── modeling_flax_wav2vec2.py
│           │       │   │   ├── modeling_tf_wav2vec2.py
│           │       │   │   ├── modeling_wav2vec2.py
│           │       │   │   ├── processing_wav2vec2.py
│           │       │   │   └── tokenization_wav2vec2.py
│           │       │   ├── wav2vec2_conformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_wav2vec2_conformer.py
│           │       │   │   ├── convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_wav2vec2_conformer.py
│           │       │   ├── wav2vec2_phoneme/
│           │       │   │   ├── __init__.py
│           │       │   │   └── tokenization_wav2vec2_phoneme.py
│           │       │   ├── wav2vec2_with_lm/
│           │       │   │   ├── __init__.py
│           │       │   │   └── processing_wav2vec2_with_lm.py
│           │       │   ├── wavlm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_wavlm.py
│           │       │   │   ├── convert_wavlm_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_wavlm_original_s3prl_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_wavlm.py
│           │       │   ├── whisper/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_whisper.py
│           │       │   │   ├── english_normalizer.py
│           │       │   │   ├── feature_extraction_whisper.py
│           │       │   │   ├── modeling_tf_whisper.py
│           │       │   │   ├── modeling_whisper.py
│           │       │   │   ├── processing_whisper.py
│           │       │   │   └── tokenization_whisper.py
│           │       │   ├── x_clip/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_x_clip.py
│           │       │   │   ├── convert_x_clip_original_pytorch_to_hf.py
│           │       │   │   ├── modeling_x_clip.py
│           │       │   │   └── processing_x_clip.py
│           │       │   ├── xglm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_xglm.py
│           │       │   │   ├── convert_xglm_original_ckpt_to_trfms.py
│           │       │   │   ├── modeling_flax_xglm.py
│           │       │   │   ├── modeling_tf_xglm.py
│           │       │   │   ├── modeling_xglm.py
│           │       │   │   ├── tokenization_xglm.py
│           │       │   │   └── tokenization_xglm_fast.py
│           │       │   ├── xlm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_xlm.py
│           │       │   │   ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_tf_xlm.py
│           │       │   │   ├── modeling_xlm.py
│           │       │   │   └── tokenization_xlm.py
│           │       │   ├── xlm_prophetnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_xlm_prophetnet.py
│           │       │   │   ├── modeling_xlm_prophetnet.py
│           │       │   │   └── tokenization_xlm_prophetnet.py
│           │       │   ├── xlm_roberta/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_xlm_roberta.py
│           │       │   │   ├── modeling_flax_xlm_roberta.py
│           │       │   │   ├── modeling_tf_xlm_roberta.py
│           │       │   │   ├── modeling_xlm_roberta.py
│           │       │   │   ├── tokenization_xlm_roberta.py
│           │       │   │   └── tokenization_xlm_roberta_fast.py
│           │       │   ├── xlm_roberta_xl/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_xlm_roberta_xl.py
│           │       │   │   ├── convert_xlm_roberta_xl_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_xlm_roberta_xl.py
│           │       │   ├── xlnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_xlnet.py
│           │       │   │   ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_tf_xlnet.py
│           │       │   │   ├── modeling_xlnet.py
│           │       │   │   ├── tokenization_xlnet.py
│           │       │   │   └── tokenization_xlnet_fast.py
│           │       │   ├── yolos/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_yolos.py
│           │       │   │   ├── convert_yolos_to_pytorch.py
│           │       │   │   ├── feature_extraction_yolos.py
│           │       │   │   └── modeling_yolos.py
│           │       │   └── yoso/
│           │       │       ├── __init__.py
│           │       │       ├── common.h
│           │       │       ├── common_cuda.h
│           │       │       ├── common_cuda_device.h
│           │       │       ├── configuration_yoso.py
│           │       │       ├── convert_yoso_pytorch_to_pytorch.py
│           │       │       ├── fast_lsh_cumulation.cu
│           │       │       ├── fast_lsh_cumulation.h
│           │       │       ├── fast_lsh_cumulation_cuda.cu
│           │       │       ├── fast_lsh_cumulation_cuda.h
│           │       │       ├── fast_lsh_cumulation_torch.cpp
│           │       │       └── modeling_yoso.py
│           │       ├── onnx/
│           │       │   ├── __init__.py
│           │       │   ├── __main__.py
│           │       │   ├── config.py
│           │       │   ├── convert.py
│           │       │   ├── features.py
│           │       │   └── utils.py
│           │       ├── optimization.py
│           │       ├── optimization_tf.py
│           │       ├── pipelines/
│           │       │   ├── __init__.py
│           │       │   ├── audio_classification.py
│           │       │   ├── audio_utils.py
│           │       │   ├── automatic_speech_recognition.py
│           │       │   ├── base.py
│           │       │   ├── conversational.py
│           │       │   ├── depth_estimation.py
│           │       │   ├── document_question_answering.py
│           │       │   ├── feature_extraction.py
│           │       │   ├── fill_mask.py
│           │       │   ├── image_classification.py
│           │       │   ├── image_segmentation.py
│           │       │   ├── image_to_text.py
│           │       │   ├── object_detection.py
│           │       │   ├── pt_utils.py
│           │       │   ├── question_answering.py
│           │       │   ├── table_question_answering.py
│           │       │   ├── text2text_generation.py
│           │       │   ├── text_classification.py
│           │       │   ├── text_generation.py
│           │       │   ├── token_classification.py
│           │       │   ├── visual_question_answering.py
│           │       │   ├── zero_shot_classification.py
│           │       │   ├── zero_shot_image_classification.py
│           │       │   └── zero_shot_object_detection.py
│           │       ├── processing_utils.py
│           │       ├── pytorch_utils.py
│           │       ├── sagemaker/
│           │       │   ├── __init__.py
│           │       │   ├── trainer_sm.py
│           │       │   └── training_args_sm.py
│           │       ├── testing_utils.py
│           │       ├── tf_utils.py
│           │       ├── tokenization_utils.py
│           │       ├── tokenization_utils_base.py
│           │       ├── tokenization_utils_fast.py
│           │       ├── trainer.py
│           │       ├── trainer_callback.py
│           │       ├── trainer_pt_utils.py
│           │       ├── trainer_seq2seq.py
│           │       ├── trainer_tf.py
│           │       ├── trainer_utils.py
│           │       ├── training_args.py
│           │       ├── training_args_seq2seq.py
│           │       ├── training_args_tf.py
│           │       └── utils/
│           │           ├── __init__.py
│           │           ├── bitsandbytes.py
│           │           ├── constants.py
│           │           ├── doc.py
│           │           ├── dummy_detectron2_objects.py
│           │           ├── dummy_flax_objects.py
│           │           ├── dummy_pt_objects.py
│           │           ├── dummy_scatter_objects.py
│           │           ├── dummy_sentencepiece_and_speech_objects.py
│           │           ├── dummy_sentencepiece_and_tokenizers_objects.py
│           │           ├── dummy_sentencepiece_objects.py
│           │           ├── dummy_speech_objects.py
│           │           ├── dummy_tensorflow_text_objects.py
│           │           ├── dummy_tf_objects.py
│           │           ├── dummy_timm_and_vision_objects.py
│           │           ├── dummy_tokenizers_objects.py
│           │           ├── dummy_vision_objects.py
│           │           ├── fx.py
│           │           ├── generic.py
│           │           ├── hp_naming.py
│           │           ├── hub.py
│           │           ├── import_utils.py
│           │           ├── logging.py
│           │           ├── model_parallel_utils.py
│           │           ├── notebook.py
│           │           ├── sentencepiece_model_pb2.py
│           │           └── versions.py
│           ├── templates/
│           │   ├── adding_a_missing_tokenization_test/
│           │   │   ├── README.md
│           │   │   ├── cookiecutter-template-{{cookiecutter.modelname}}/
│           │   │   │   └── test_tokenization_{{cookiecutter.lowercase_modelname}}.py
│           │   │   └── cookiecutter.json
│           │   ├── adding_a_new_example_script/
│           │   │   ├── README.md
│           │   │   ├── cookiecutter.json
│           │   │   └── {{cookiecutter.directory_name}}/
│           │   │       └── run_{{cookiecutter.example_shortcut}}.py
│           │   └── adding_a_new_model/
│           │       ├── ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md
│           │       ├── README.md
│           │       ├── cookiecutter-template-{{cookiecutter.modelname}}/
│           │       │   ├── __init__.py
│           │       │   ├── configuration.json
│           │       │   ├── configuration_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── modeling_flax_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── modeling_tf_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── modeling_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── test_modeling_flax_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── test_modeling_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── to_replace_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── tokenization_{{cookiecutter.lowercase_modelname}}.py
│           │       │   └── {{cookiecutter.lowercase_modelname}}.mdx
│           │       ├── cookiecutter.json
│           │       ├── open_model_proposals/
│           │       │   ├── ADD_BIG_BIRD.md
│           │       │   └── README.md
│           │       └── tests/
│           │           ├── encoder-bert-tokenizer.json
│           │           ├── flax-encoder-bert-tokenizer.json
│           │           ├── flax-seq-2-seq-bart-tokenizer.json
│           │           ├── pt-encoder-bert-tokenizer.json
│           │           ├── pt-seq-2-seq-bart-tokenizer.json
│           │           ├── standalone.json
│           │           ├── tf-encoder-bert-tokenizer.json
│           │           └── tf-seq-2-seq-bart-tokenizer.json
│           ├── tests/
│           │   ├── __init__.py
│           │   ├── benchmark/
│           │   │   ├── __init__.py
│           │   │   ├── test_benchmark.py
│           │   │   └── test_benchmark_tf.py
│           │   ├── deepspeed/
│           │   │   ├── ds_config_zero2.json
│           │   │   ├── ds_config_zero3.json
│           │   │   ├── test_deepspeed.py
│           │   │   ├── test_model_zoo.py
│           │   │   └── vit_feature_extractor.json
│           │   ├── extended/
│           │   │   └── test_trainer_ext.py
│           │   ├── fixtures/
│           │   │   ├── add_distilbert_like_config.json
│           │   │   ├── dummy-config.json
│           │   │   ├── dummy_feature_extractor_config.json
│           │   │   ├── empty.txt
│           │   │   ├── input.txt
│           │   │   ├── merges.txt
│           │   │   ├── preprocessor_config.json
│           │   │   ├── sample_text.txt
│           │   │   ├── sample_text_no_unicode.txt
│           │   │   ├── spiece.model
│           │   │   ├── test_entity_vocab.json
│           │   │   ├── test_sentencepiece.model
│           │   │   ├── test_sentencepiece_bpe.model
│           │   │   ├── test_sentencepiece_no_bos.model
│           │   │   ├── test_sentencepiece_with_bytefallback.model
│           │   │   ├── tests_samples/
│           │   │   │   ├── .gitignore
│           │   │   │   ├── COCO/
│           │   │   │   │   ├── coco_annotations.txt
│           │   │   │   │   └── coco_panoptic_annotations.txt
│           │   │   │   ├── GermEval/
│           │   │   │   │   ├── dev.txt
│           │   │   │   │   ├── labels.txt
│           │   │   │   │   └── train.txt
│           │   │   │   ├── MRPC/
│           │   │   │   │   ├── dev.csv
│           │   │   │   │   ├── dev.tsv
│           │   │   │   │   ├── train.csv
│           │   │   │   │   └── train.tsv
│           │   │   │   ├── SQUAD/
│           │   │   │   │   └── sample.json
│           │   │   │   ├── STS-B/
│           │   │   │   │   ├── dev.tsv
│           │   │   │   │   └── train.tsv
│           │   │   │   ├── conll/
│           │   │   │   │   └── sample.json
│           │   │   │   ├── swag/
│           │   │   │   │   └── sample.json
│           │   │   │   ├── wiki_text/
│           │   │   │   │   └── wiki_00
│           │   │   │   ├── wmt16/
│           │   │   │   │   └── sample.json
│           │   │   │   ├── wmt_en_ro/
│           │   │   │   │   ├── test.json
│           │   │   │   │   ├── train.json
│           │   │   │   │   └── val.json
│           │   │   │   └── xsum/
│           │   │   │       └── sample.json
│           │   │   ├── vocab.json
│           │   │   └── vocab.txt
│           │   ├── generation/
│           │   │   ├── __init__.py
│           │   │   ├── test_generation_beam_constraints.py
│           │   │   ├── test_generation_beam_search.py
│           │   │   ├── test_generation_flax_logits_process.py
│           │   │   ├── test_generation_flax_utils.py
│           │   │   ├── test_generation_logits_process.py
│           │   │   ├── test_generation_stopping_criteria.py
│           │   │   ├── test_generation_tf_logits_process.py
│           │   │   ├── test_generation_tf_utils.py
│           │   │   └── test_generation_utils.py
│           │   ├── mixed_int8/
│           │   │   ├── README.md
│           │   │   ├── __init__.py
│           │   │   └── test_mixed_int8.py
│           │   ├── models/
│           │   │   ├── __init__.py
│           │   │   ├── albert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_albert.py
│           │   │   │   ├── test_modeling_flax_albert.py
│           │   │   │   ├── test_modeling_tf_albert.py
│           │   │   │   └── test_tokenization_albert.py
│           │   │   ├── auto/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_configuration_auto.py
│           │   │   │   ├── test_feature_extraction_auto.py
│           │   │   │   ├── test_modeling_auto.py
│           │   │   │   ├── test_modeling_flax_auto.py
│           │   │   │   ├── test_modeling_tf_auto.py
│           │   │   │   ├── test_modeling_tf_pytorch.py
│           │   │   │   ├── test_processor_auto.py
│           │   │   │   └── test_tokenization_auto.py
│           │   │   ├── bart/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_bart.py
│           │   │   │   ├── test_modeling_flax_bart.py
│           │   │   │   ├── test_modeling_tf_bart.py
│           │   │   │   └── test_tokenization_bart.py
│           │   │   ├── barthez/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_barthez.py
│           │   │   ├── bartpho/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_bartpho.py
│           │   │   ├── beit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_beit.py
│           │   │   │   ├── test_modeling_beit.py
│           │   │   │   └── test_modeling_flax_beit.py
│           │   │   ├── bert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_bert.py
│           │   │   │   ├── test_modeling_flax_bert.py
│           │   │   │   ├── test_modeling_tf_bert.py
│           │   │   │   ├── test_tokenization_bert.py
│           │   │   │   └── test_tokenization_bert_tf.py
│           │   │   ├── bert_generation/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_bert_generation.py
│           │   │   │   └── test_tokenization_bert_generation.py
│           │   │   ├── bert_japanese/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_bert_japanese.py
│           │   │   ├── bertweet/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_bertweet.py
│           │   │   ├── big_bird/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_big_bird.py
│           │   │   │   ├── test_modeling_flax_big_bird.py
│           │   │   │   └── test_tokenization_big_bird.py
│           │   │   ├── bigbird_pegasus/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_bigbird_pegasus.py
│           │   │   ├── blenderbot/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_blenderbot.py
│           │   │   │   ├── test_modeling_flax_blenderbot.py
│           │   │   │   ├── test_modeling_tf_blenderbot.py
│           │   │   │   └── test_tokenization_blenderbot.py
│           │   │   ├── blenderbot_small/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_blenderbot_small.py
│           │   │   │   ├── test_modeling_flax_blenderbot_small.py
│           │   │   │   ├── test_modeling_tf_blenderbot_small.py
│           │   │   │   └── test_tokenization_blenderbot_small.py
│           │   │   ├── bloom/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_bloom.py
│           │   │   │   └── test_tokenization_bloom.py
│           │   │   ├── bort/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_bort.py
│           │   │   │   └── test_modeling_tf_bort.py
│           │   │   ├── byt5/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_byt5.py
│           │   │   ├── camembert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_camembert.py
│           │   │   │   ├── test_modeling_tf_camembert.py
│           │   │   │   └── test_tokenization_camembert.py
│           │   │   ├── canine/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_canine.py
│           │   │   │   └── test_tokenization_canine.py
│           │   │   ├── clip/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_clip.py
│           │   │   │   ├── test_modeling_clip.py
│           │   │   │   ├── test_modeling_flax_clip.py
│           │   │   │   ├── test_modeling_tf_clip.py
│           │   │   │   ├── test_processor_clip.py
│           │   │   │   └── test_tokenization_clip.py
│           │   │   ├── codegen/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_codegen.py
│           │   │   │   └── test_tokenization_codegen.py
│           │   │   ├── conditional_detr/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_conditional_detr.py
│           │   │   │   └── test_modeling_conditional_detr.py
│           │   │   ├── convbert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_convbert.py
│           │   │   │   └── test_modeling_tf_convbert.py
│           │   │   ├── convnext/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_convnext.py
│           │   │   │   ├── test_modeling_convnext.py
│           │   │   │   └── test_modeling_tf_convnext.py
│           │   │   ├── cpm/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_cpm.py
│           │   │   ├── ctrl/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_ctrl.py
│           │   │   │   ├── test_modeling_tf_ctrl.py
│           │   │   │   └── test_tokenization_ctrl.py
│           │   │   ├── cvt/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_cvt.py
│           │   │   │   └── test_modeling_tf_cvt.py
│           │   │   ├── data2vec/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_data2vec_audio.py
│           │   │   │   ├── test_modeling_data2vec_text.py
│           │   │   │   ├── test_modeling_data2vec_vision.py
│           │   │   │   └── test_modeling_tf_data2vec_vision.py
│           │   │   ├── deberta/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_deberta.py
│           │   │   │   ├── test_modeling_tf_deberta.py
│           │   │   │   └── test_tokenization_deberta.py
│           │   │   ├── deberta_v2/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_deberta_v2.py
│           │   │   │   ├── test_modeling_tf_deberta_v2.py
│           │   │   │   └── test_tokenization_deberta_v2.py
│           │   │   ├── decision_transformer/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_decision_transformer.py
│           │   │   ├── deformable_detr/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_deformable_detr.py
│           │   │   │   └── test_modeling_deformable_detr.py
│           │   │   ├── deit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_deit.py
│           │   │   │   ├── test_modeling_deit.py
│           │   │   │   └── test_modeling_tf_deit.py
│           │   │   ├── detr/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_detr.py
│           │   │   │   └── test_modeling_detr.py
│           │   │   ├── distilbert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_distilbert.py
│           │   │   │   ├── test_modeling_flax_distilbert.py
│           │   │   │   ├── test_modeling_tf_distilbert.py
│           │   │   │   └── test_tokenization_distilbert.py
│           │   │   ├── dit/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_dit.py
│           │   │   ├── donut/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_donut.py
│           │   │   │   └── test_modeling_donut_swin.py
│           │   │   ├── dpr/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_dpr.py
│           │   │   │   ├── test_modeling_tf_dpr.py
│           │   │   │   └── test_tokenization_dpr.py
│           │   │   ├── dpt/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_dpt.py
│           │   │   │   └── test_modeling_dpt.py
│           │   │   ├── electra/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_electra.py
│           │   │   │   ├── test_modeling_flax_electra.py
│           │   │   │   └── test_modeling_tf_electra.py
│           │   │   ├── encoder_decoder/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_encoder_decoder.py
│           │   │   │   ├── test_modeling_flax_encoder_decoder.py
│           │   │   │   └── test_modeling_tf_encoder_decoder.py
│           │   │   ├── ernie/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_ernie.py
│           │   │   ├── esm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_esm.py
│           │   │   │   ├── test_modeling_esmfold.py
│           │   │   │   ├── test_modeling_tf_esm.py
│           │   │   │   └── test_tokenization_esm.py
│           │   │   ├── flaubert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flaubert.py
│           │   │   │   └── test_modeling_tf_flaubert.py
│           │   │   ├── flava/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_flava.py
│           │   │   │   ├── test_modeling_flava.py
│           │   │   │   └── test_processor_flava.py
│           │   │   ├── fnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_fnet.py
│           │   │   │   └── test_tokenization_fnet.py
│           │   │   ├── fsmt/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_fsmt.py
│           │   │   │   └── test_tokenization_fsmt.py
│           │   │   ├── funnel/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_funnel.py
│           │   │   │   ├── test_modeling_tf_funnel.py
│           │   │   │   └── test_tokenization_funnel.py
│           │   │   ├── glpn/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_glpn.py
│           │   │   │   └── test_modeling_glpn.py
│           │   │   ├── gpt2/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_gpt2.py
│           │   │   │   ├── test_modeling_gpt2.py
│           │   │   │   ├── test_modeling_tf_gpt2.py
│           │   │   │   └── test_tokenization_gpt2.py
│           │   │   ├── gpt_neo/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_gpt_neo.py
│           │   │   │   └── test_modeling_gpt_neo.py
│           │   │   ├── gpt_neox/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_gpt_neox.py
│           │   │   ├── gpt_neox_japanese/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_gpt_neox_japanese.py
│           │   │   │   └── test_tokenization_gpt_neox_japanese.py
│           │   │   ├── gptj/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_gptj.py
│           │   │   │   ├── test_modeling_gptj.py
│           │   │   │   └── test_modeling_tf_gptj.py
│           │   │   ├── groupvit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_groupvit.py
│           │   │   │   └── test_modeling_tf_groupvit.py
│           │   │   ├── herbert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_herbert.py
│           │   │   ├── hubert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_hubert.py
│           │   │   │   └── test_modeling_tf_hubert.py
│           │   │   ├── ibert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_ibert.py
│           │   │   ├── imagegpt/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_imagegpt.py
│           │   │   │   └── test_modeling_imagegpt.py
│           │   │   ├── layoutlm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_layoutlm.py
│           │   │   │   ├── test_modeling_tf_layoutlm.py
│           │   │   │   └── test_tokenization_layoutlm.py
│           │   │   ├── layoutlmv2/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_layoutlmv2.py
│           │   │   │   ├── test_modeling_layoutlmv2.py
│           │   │   │   ├── test_processor_layoutlmv2.py
│           │   │   │   └── test_tokenization_layoutlmv2.py
│           │   │   ├── layoutlmv3/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_layoutlmv3.py
│           │   │   │   ├── test_modeling_layoutlmv3.py
│           │   │   │   ├── test_modeling_tf_layoutlmv3.py
│           │   │   │   ├── test_processor_layoutlmv3.py
│           │   │   │   └── test_tokenization_layoutlmv3.py
│           │   │   ├── layoutxlm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_processor_layoutxlm.py
│           │   │   │   └── test_tokenization_layoutxlm.py
│           │   │   ├── led/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_led.py
│           │   │   │   └── test_modeling_tf_led.py
│           │   │   ├── levit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_levit.py
│           │   │   │   └── test_modeling_levit.py
│           │   │   ├── lilt/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_lilt.py
│           │   │   ├── longformer/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_longformer.py
│           │   │   │   ├── test_modeling_tf_longformer.py
│           │   │   │   └── test_tokenization_longformer.py
│           │   │   ├── longt5/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_longt5.py
│           │   │   │   └── test_modeling_longt5.py
│           │   │   ├── luke/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_luke.py
│           │   │   │   └── test_tokenization_luke.py
│           │   │   ├── lxmert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_lxmert.py
│           │   │   │   ├── test_modeling_tf_lxmert.py
│           │   │   │   └── test_tokenization_lxmert.py
│           │   │   ├── m2m_100/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_m2m_100.py
│           │   │   │   └── test_tokenization_m2m_100.py
│           │   │   ├── marian/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_marian.py
│           │   │   │   ├── test_modeling_marian.py
│           │   │   │   ├── test_modeling_tf_marian.py
│           │   │   │   └── test_tokenization_marian.py
│           │   │   ├── markuplm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_markuplm.py
│           │   │   │   ├── test_modeling_markuplm.py
│           │   │   │   ├── test_processor_markuplm.py
│           │   │   │   └── test_tokenization_markuplm.py
│           │   │   ├── maskformer/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_maskformer.py
│           │   │   │   └── test_modeling_maskformer.py
│           │   │   ├── mbart/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_mbart.py
│           │   │   │   ├── test_modeling_mbart.py
│           │   │   │   ├── test_modeling_tf_mbart.py
│           │   │   │   └── test_tokenization_mbart.py
│           │   │   ├── mbart50/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_mbart50.py
│           │   │   ├── mctct/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_mctct.py
│           │   │   │   ├── test_modeling_mctct.py
│           │   │   │   └── test_processor_mctct.py
│           │   │   ├── megatron_bert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_megatron_bert.py
│           │   │   ├── megatron_gpt2/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_megatron_gpt2.py
│           │   │   ├── mluke/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_mluke.py
│           │   │   ├── mobilebert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_mobilebert.py
│           │   │   │   ├── test_modeling_tf_mobilebert.py
│           │   │   │   └── test_tokenization_mobilebert.py
│           │   │   ├── mobilevit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_mobilevit.py
│           │   │   │   ├── test_modeling_mobilevit.py
│           │   │   │   └── test_modeling_tf_mobilevit.py
│           │   │   ├── mpnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_mpnet.py
│           │   │   │   ├── test_modeling_tf_mpnet.py
│           │   │   │   └── test_tokenization_mpnet.py
│           │   │   ├── mt5/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_mt5.py
│           │   │   │   ├── test_modeling_mt5.py
│           │   │   │   └── test_modeling_tf_mt5.py
│           │   │   ├── mvp/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_mvp.py
│           │   │   │   └── test_tokenization_mvp.py
│           │   │   ├── nezha/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_nezha.py
│           │   │   ├── nllb/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_nllb.py
│           │   │   ├── nystromformer/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_nystromformer.py
│           │   │   ├── openai/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_openai.py
│           │   │   │   ├── test_modeling_tf_openai.py
│           │   │   │   └── test_tokenization_openai.py
│           │   │   ├── opt/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_opt.py
│           │   │   │   ├── test_modeling_opt.py
│           │   │   │   └── test_modeling_tf_opt.py
│           │   │   ├── owlvit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_owlvit.py
│           │   │   │   ├── test_modeling_owlvit.py
│           │   │   │   └── test_processor_owlvit.py
│           │   │   ├── pegasus/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_pegasus.py
│           │   │   │   ├── test_modeling_pegasus.py
│           │   │   │   ├── test_modeling_tf_pegasus.py
│           │   │   │   └── test_tokenization_pegasus.py
│           │   │   ├── pegasus_x/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_pegasus_x.py
│           │   │   ├── perceiver/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_perceiver.py
│           │   │   │   └── test_tokenization_perceiver.py
│           │   │   ├── phobert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_phobert.py
│           │   │   ├── plbart/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_plbart.py
│           │   │   │   └── test_tokenization_plbart.py
│           │   │   ├── poolformer/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_poolformer.py
│           │   │   │   └── test_modeling_poolformer.py
│           │   │   ├── prophetnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_prophetnet.py
│           │   │   │   └── test_tokenization_prophetnet.py
│           │   │   ├── qdqbert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_qdqbert.py
│           │   │   ├── rag/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_rag.py
│           │   │   │   ├── test_modeling_tf_rag.py
│           │   │   │   ├── test_retrieval_rag.py
│           │   │   │   └── test_tokenization_rag.py
│           │   │   ├── realm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_realm.py
│           │   │   │   ├── test_retrieval_realm.py
│           │   │   │   └── test_tokenization_realm.py
│           │   │   ├── reformer/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_reformer.py
│           │   │   │   └── test_tokenization_reformer.py
│           │   │   ├── regnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_regnet.py
│           │   │   │   └── test_modeling_tf_regnet.py
│           │   │   ├── rembert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_rembert.py
│           │   │   │   └── test_modeling_tf_rembert.py
│           │   │   ├── resnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_resnet.py
│           │   │   │   └── test_modeling_tf_resnet.py
│           │   │   ├── retribert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_retribert.py
│           │   │   ├── roberta/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_roberta.py
│           │   │   │   ├── test_modeling_roberta.py
│           │   │   │   ├── test_modeling_tf_roberta.py
│           │   │   │   └── test_tokenization_roberta.py
│           │   │   ├── roformer/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_roformer.py
│           │   │   │   ├── test_modeling_roformer.py
│           │   │   │   ├── test_modeling_tf_roformer.py
│           │   │   │   └── test_tokenization_roformer.py
│           │   │   ├── segformer/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_segformer.py
│           │   │   │   ├── test_modeling_segformer.py
│           │   │   │   └── test_modeling_tf_segformer.py
│           │   │   ├── sew/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_sew.py
│           │   │   ├── sew_d/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_sew_d.py
│           │   │   ├── speech_encoder_decoder/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_speech_encoder_decoder.py
│           │   │   │   └── test_modeling_speech_encoder_decoder.py
│           │   │   ├── speech_to_text/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_speech_to_text.py
│           │   │   │   ├── test_modeling_speech_to_text.py
│           │   │   │   ├── test_modeling_tf_speech_to_text.py
│           │   │   │   ├── test_processor_speech_to_text.py
│           │   │   │   └── test_tokenization_speech_to_text.py
│           │   │   ├── speech_to_text_2/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_speech_to_text_2.py
│           │   │   │   └── test_tokenization_speech_to_text_2.py
│           │   │   ├── splinter/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_splinter.py
│           │   │   ├── squeezebert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_squeezebert.py
│           │   │   │   └── test_tokenization_squeezebert.py
│           │   │   ├── swin/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_swin.py
│           │   │   │   └── test_modeling_tf_swin.py
│           │   │   ├── swinv2/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_swinv2.py
│           │   │   ├── t5/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_t5.py
│           │   │   │   ├── test_modeling_t5.py
│           │   │   │   ├── test_modeling_tf_t5.py
│           │   │   │   └── test_tokenization_t5.py
│           │   │   ├── table_transformer/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_table_transformer.py
│           │   │   ├── tapas/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_tapas.py
│           │   │   │   ├── test_modeling_tf_tapas.py
│           │   │   │   └── test_tokenization_tapas.py
│           │   │   ├── tapex/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_tapex.py
│           │   │   ├── time_series_transformer/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_time_series_transformer.py
│           │   │   ├── trajectory_transformer/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_trajectory_transformer.py
│           │   │   ├── transfo_xl/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_tf_transfo_xl.py
│           │   │   │   ├── test_modeling_transfo_xl.py
│           │   │   │   └── test_tokenization_transfo_xl.py
│           │   │   ├── trocr/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_trocr.py
│           │   │   ├── unispeech/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_unispeech.py
│           │   │   ├── unispeech_sat/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_unispeech_sat.py
│           │   │   ├── van/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_van.py
│           │   │   ├── videomae/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_videomae.py
│           │   │   │   └── test_modeling_videomae.py
│           │   │   ├── vilt/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_vilt.py
│           │   │   │   └── test_modeling_vilt.py
│           │   │   ├── vision_encoder_decoder/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_vision_encoder_decoder.py
│           │   │   │   ├── test_modeling_tf_vision_encoder_decoder.py
│           │   │   │   └── test_modeling_vision_encoder_decoder.py
│           │   │   ├── vision_text_dual_encoder/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_vision_text_dual_encoder.py
│           │   │   │   ├── test_modeling_vision_text_dual_encoder.py
│           │   │   │   └── test_processor_vision_text_dual_encoder.py
│           │   │   ├── visual_bert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_visual_bert.py
│           │   │   ├── vit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_vit.py
│           │   │   │   ├── test_modeling_flax_vit.py
│           │   │   │   ├── test_modeling_tf_vit.py
│           │   │   │   └── test_modeling_vit.py
│           │   │   ├── vit_mae/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_tf_vit_mae.py
│           │   │   │   └── test_modeling_vit_mae.py
│           │   │   ├── vit_msn/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_vit_msn.py
│           │   │   ├── wav2vec2/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_wav2vec2.py
│           │   │   │   ├── test_modeling_flax_wav2vec2.py
│           │   │   │   ├── test_modeling_tf_wav2vec2.py
│           │   │   │   ├── test_modeling_wav2vec2.py
│           │   │   │   ├── test_processor_wav2vec2.py
│           │   │   │   └── test_tokenization_wav2vec2.py
│           │   │   ├── wav2vec2_conformer/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_wav2vec2_conformer.py
│           │   │   ├── wav2vec2_phoneme/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_wav2vec2_phoneme.py
│           │   │   ├── wav2vec2_with_lm/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_processor_wav2vec2_with_lm.py
│           │   │   ├── wavlm/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_wavlm.py
│           │   │   ├── whisper/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_whisper.py
│           │   │   │   ├── test_modeling_tf_whisper.py
│           │   │   │   ├── test_modeling_whisper.py
│           │   │   │   ├── test_processor_whisper.py
│           │   │   │   └── test_tokenization_whisper.py
│           │   │   ├── x_clip/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_x_clip.py
│           │   │   ├── xglm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_xglm.py
│           │   │   │   ├── test_modeling_tf_xglm.py
│           │   │   │   ├── test_modeling_xglm.py
│           │   │   │   └── test_tokenization_xglm.py
│           │   │   ├── xlm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_tf_xlm.py
│           │   │   │   ├── test_modeling_xlm.py
│           │   │   │   └── test_tokenization_xlm.py
│           │   │   ├── xlm_prophetnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_xlm_prophetnet.py
│           │   │   │   └── test_tokenization_xlm_prophetnet.py
│           │   │   ├── xlm_roberta/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_xlm_roberta.py
│           │   │   │   ├── test_modeling_tf_xlm_roberta.py
│           │   │   │   ├── test_modeling_xlm_roberta.py
│           │   │   │   └── test_tokenization_xlm_roberta.py
│           │   │   ├── xlm_roberta_xl/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_xlm_roberta_xl.py
│           │   │   ├── xlnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_tf_xlnet.py
│           │   │   │   ├── test_modeling_xlnet.py
│           │   │   │   └── test_tokenization_xlnet.py
│           │   │   ├── yolos/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_yolos.py
│           │   │   │   └── test_modeling_yolos.py
│           │   │   └── yoso/
│           │   │       ├── __init__.py
│           │   │       └── test_modeling_yoso.py
│           │   ├── onnx/
│           │   │   ├── __init__.py
│           │   │   ├── test_features.py
│           │   │   ├── test_onnx.py
│           │   │   └── test_onnx_v2.py
│           │   ├── optimization/
│           │   │   ├── __init__.py
│           │   │   ├── test_optimization.py
│           │   │   └── test_optimization_tf.py
│           │   ├── pipelines/
│           │   │   ├── __init__.py
│           │   │   ├── test_pipelines_audio_classification.py
│           │   │   ├── test_pipelines_automatic_speech_recognition.py
│           │   │   ├── test_pipelines_common.py
│           │   │   ├── test_pipelines_conversational.py
│           │   │   ├── test_pipelines_depth_estimation.py
│           │   │   ├── test_pipelines_document_question_answering.py
│           │   │   ├── test_pipelines_feature_extraction.py
│           │   │   ├── test_pipelines_fill_mask.py
│           │   │   ├── test_pipelines_image_classification.py
│           │   │   ├── test_pipelines_image_segmentation.py
│           │   │   ├── test_pipelines_image_to_text.py
│           │   │   ├── test_pipelines_object_detection.py
│           │   │   ├── test_pipelines_question_answering.py
│           │   │   ├── test_pipelines_summarization.py
│           │   │   ├── test_pipelines_table_question_answering.py
│           │   │   ├── test_pipelines_text2text_generation.py
│           │   │   ├── test_pipelines_text_classification.py
│           │   │   ├── test_pipelines_text_generation.py
│           │   │   ├── test_pipelines_token_classification.py
│           │   │   ├── test_pipelines_translation.py
│           │   │   ├── test_pipelines_visual_question_answering.py
│           │   │   ├── test_pipelines_zero_shot.py
│           │   │   ├── test_pipelines_zero_shot_image_classification.py
│           │   │   └── test_pipelines_zero_shot_object_detection.py
│           │   ├── repo_utils/
│           │   │   ├── test_check_copies.py
│           │   │   ├── test_check_dummies.py
│           │   │   └── test_tests_fetcher.py
│           │   ├── sagemaker/
│           │   │   ├── README.md
│           │   │   ├── __init__.py
│           │   │   ├── conftest.py
│           │   │   ├── scripts/
│           │   │   │   ├── pytorch/
│           │   │   │   │   ├── requirements.txt
│           │   │   │   │   ├── run_ddp.py
│           │   │   │   │   └── run_glue_model_parallelism.py
│           │   │   │   └── tensorflow/
│           │   │   │       ├── requirements.txt
│           │   │   │       ├── run_tf.py
│           │   │   │       └── run_tf_dist.py
│           │   │   ├── test_multi_node_data_parallel.py
│           │   │   ├── test_multi_node_model_parallel.py
│           │   │   └── test_single_node_gpu.py
│           │   ├── test_configuration_common.py
│           │   ├── test_feature_extraction_common.py
│           │   ├── test_image_transforms.py
│           │   ├── test_modeling_common.py
│           │   ├── test_modeling_flax_common.py
│           │   ├── test_modeling_tf_common.py
│           │   ├── test_sequence_feature_extraction_common.py
│           │   ├── test_tokenization_common.py
│           │   ├── tokenization/
│           │   │   ├── __init__.py
│           │   │   ├── test_tokenization_fast.py
│           │   │   └── test_tokenization_utils.py
│           │   ├── trainer/
│           │   │   ├── __init__.py
│           │   │   ├── test_data_collator.py
│           │   │   ├── test_trainer.py
│           │   │   ├── test_trainer_callback.py
│           │   │   ├── test_trainer_distributed.py
│           │   │   ├── test_trainer_seq2seq.py
│           │   │   ├── test_trainer_tpu.py
│           │   │   └── test_trainer_utils.py
│           │   └── utils/
│           │       ├── __init__.py
│           │       ├── test_activations.py
│           │       ├── test_activations_tf.py
│           │       ├── test_add_new_model_like.py
│           │       ├── test_cli.py
│           │       ├── test_convert_slow_tokenizer.py
│           │       ├── test_doc_samples.py
│           │       ├── test_file_utils.py
│           │       ├── test_generic.py
│           │       ├── test_hf_argparser.py
│           │       ├── test_hub_utils.py
│           │       ├── test_image_utils.py
│           │       ├── test_logging.py
│           │       ├── test_model_card.py
│           │       ├── test_model_output.py
│           │       ├── test_modeling_tf_core.py
│           │       ├── test_offline.py
│           │       ├── test_skip_decorators.py
│           │       └── test_versions_utils.py
│           └── utils/
│               ├── check_config_docstrings.py
│               ├── check_copies.py
│               ├── check_doc_toc.py
│               ├── check_dummies.py
│               ├── check_inits.py
│               ├── check_repo.py
│               ├── check_self_hosted_runner.py
│               ├── check_table.py
│               ├── check_tf_ops.py
│               ├── create_dummy_models.py
│               ├── custom_init_isort.py
│               ├── documentation_tests.txt
│               ├── download_glue_data.py
│               ├── get_ci_error_statistics.py
│               ├── get_github_job_time.py
│               ├── get_modified_files.py
│               ├── notification_service.py
│               ├── notification_service_doc_tests.py
│               ├── past_ci_versions.py
│               ├── prepare_for_doc_test.py
│               ├── print_env.py
│               ├── release.py
│               ├── sort_auto_mappings.py
│               ├── test_module/
│               │   ├── __init__.py
│               │   ├── custom_configuration.py
│               │   ├── custom_feature_extraction.py
│               │   ├── custom_modeling.py
│               │   ├── custom_pipeline.py
│               │   ├── custom_processing.py
│               │   ├── custom_tokenization.py
│               │   └── custom_tokenization_fast.py
│               ├── tests_fetcher.py
│               ├── tf_ops/
│               │   └── onnx.json
│               └── update_metadata.py
├── docs/
│   ├── disk_commands.txt
│   ├── gcp_setup.md
│   └── paper.md
├── experimental/
│   ├── cost_model.py
│   └── fit_cost_model.py
├── flexllmgen/
│   ├── __init__.py
│   ├── apps/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── completion.py
│   │   ├── data_wrangle/
│   │   │   ├── README.md
│   │   │   ├── data_wrangle_run.py
│   │   │   ├── install.sh
│   │   │   ├── test_batch_query_all_opt175b.sh
│   │   │   ├── test_batch_query_all_opt30b.sh
│   │   │   ├── test_batch_query_all_opt6.7b.sh
│   │   │   ├── test_batch_query_case.sh
│   │   │   ├── test_single_query_all_opt6.7b.sh
│   │   │   ├── test_single_query_case.sh
│   │   │   └── utils/
│   │   │       ├── constants.py
│   │   │       ├── data_utils.py
│   │   │       ├── prompt_utils.py
│   │   │       └── utils.py
│   │   ├── helm_fast_test.py
│   │   ├── helm_passed_30b.sh
│   │   └── helm_run.py
│   ├── compression.py
│   ├── dist_flex_opt.py
│   ├── dist_utils.py
│   ├── flex_opt.py
│   ├── opt_config.py
│   ├── profile_bandwidth.py
│   ├── profile_matmul.py
│   ├── pytorch_backend.py
│   ├── timer.py
│   └── utils.py
├── pyproject.toml
└── scripts/
    ├── mount_nvme_aws.sh
    ├── mount_nvme_gcp.sh
    ├── step_2_consolidate_992_shards_to_singleton.py
    ├── step_3_convert_to_numpy_weights.py
    ├── upload_pypi.sh
    └── utils.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Mac system files
.DS_store

# built binaries
benchmark/third_party/pagecache-mangagement/trunk/fadv
benchmark/third_party/pagecache-mangagement/trunk/*.so
benchmark/third_party/pagecache-mangagement/trunk/sfr
benchmark/third_party/pagecache-mangagement/trunk/Makefile
dist

# VS Code, Vim & JetBrains
.vscode/
.idea
*.swp

# cache
*__pycache__
*.egg-info
flexllmgen/apps/data
flexllmgen/apps/runs
flexllmgen/apps/benchmark_output
flexllmgen/apps/data_wrangle/data
flexllmgen/apps/data_wrangle/outputs
flexllmgen/apps/data_wrangle/core

# pickle
*.pkl

# log files
*.tsv
*.log
*.raw

# tmp scripts
today_job.sh


================================================
FILE: LICENSE
================================================
Copyright 2023 - The FlexLLMGen team. All rights reserved.

                                  Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# FlexLLMGen: High-throughput Generative Inference of Large Language Models with a Single GPU [[paper](https://arxiv.org/abs/2303.06865)]

FlexLLMGen is a high-throughput generation engine for running large language models with limited GPU memory. FlexLLMGen allows **high-throughput** generation by IO-efficient offloading, compression, and **large effective batch sizes**.

## Motivation

In recent years, large language models (LLMs) have shown great performance across a 
wide range of tasks. Increasingly, LLMs have been applied not only to interactive 
applications (such as chat), but also to many "back-of-house" tasks.
These tasks include benchmarking, information extraction, data wrangling, and form processing.

One key characteristic of these applications is that they are **throughput-oriented**: they require
running LLM inferences over millions of tokens in batches, e.g., all the private documents in a company's
corpus, or all the tasks in the [HELM](https://crfm.stanford.edu/helm/latest/) benchmark.
These workloads are less sensitive to latency - the user starts up a job and lets it run overnight -
but increasing throughput is critical for reducing costs.
Throughput is a measure of tokens processed per second over the job's entire runtime (which can be hours).
Throughput-oriented workloads provide opportunities to trade off latency for higher throughput, which
makes it easier to take advantage of low-cost commodity GPUs. 

The goal of FlexLLMGen is to create a high-throughput system to enable new and exciting applications of 
foundation models to throughput-oriented tasks on low-cost hardware, such as a single commodity GPU
instead of expensive systems.

Check out the [examples](#usage-and-examples) of what you can run on a single commodity GPU with FlexLLMGen, including benchmarking and data wrangling.

❌ **Limitation**. As an offloading-based system running on weak GPUs, FlexLLMGen also has its limitations.
FlexLLMGen can be significantly slower than the case when you have enough powerful GPUs to hold the whole model, especially for small-batch cases.
FlexLLMGen is mostly optimized for throughput-oriented batch processing settings (e.g., classifying or extracting information from many documents in batches), on single GPUs.

----------

This project was made possible thanks to a collaboration with

<a href="https://cs.stanford.edu/"><img src="https://identity.stanford.edu/wp-content/uploads/sites/3/2020/06/wordmark-nospace-red.png" height="20"></a> &nbsp;&nbsp;&nbsp;
<a href="https://sky.cs.berkeley.edu/"><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/82/University_of_California%2C_Berkeley_logo.svg/1280px-University_of_California%2C_Berkeley_logo.svg.png" height="22"></a> &nbsp;&nbsp;&nbsp;
<a href="https://www.andrew.cmu.edu/user/beidic/"><img src="https://upload.wikimedia.org/wikipedia/commons/9/9b/Carnegie_Mellon_wordmark.svg" height="20"></a> &nbsp;&nbsp;&nbsp;
<a href="https://www.together.xyz/"><img src="https://images.squarespace-cdn.com/content/v1/6358bea282189a0adf57fe16/eef09191-631f-40d9-9bfd-f875b25bcf0b/together-logo-black-transparent2.png" height="20"></a> &nbsp;&nbsp;&nbsp;
<a href="https://research.yandex.com/"><img src="https://storage.yandexcloud.net/yandex-research/assets/yandex_research.png" height="20"></a> &nbsp;&nbsp;&nbsp;
<a href="https://ds3lab.inf.ethz.ch/"><img src="https://user-images.githubusercontent.com/1608867/220273382-c09669b3-42fd-47c2-b88c-7ed55cb43820.png" height="20"></a>

----------

## Content
- [Installation](#installation)
- [Usage and Examples](#usage-and-examples)
  - [Get Started with a Single GPU](#get-started-with-a-single-gpu)
  - [Run HELM Benchmark with FlexLLMGen](#run-helm-benchmark-with-flexllmgen)
  - [Run Data Wrangling Tasks with FlexLLMGen](#run-data-wrangling-tasks-with-flexllmgen)
  - [Scaling to Distributed GPUs](#scaling-to-distributed-gpus)
  - [API Example](#api-example)
  - [Frequently Asked Questions](#frequently-asked-questions)
- [Performance Results](#performance-results)
- [How It Works](#how-it-works)
- [Roadmap](#roadmap)

## Installation
Requirements:  
 - PyTorch >= 1.12 [(Help)](https://pytorch.org/get-started/locally/)

### Method 1: With pip
```
pip install flexllmgen
```

### Method 2: From source
```
git clone https://github.com/FMInference/FlexLLMGen.git
cd FlexLLMGen
pip install -e .
```

## Usage and Examples

### Get Started with a Single GPU

#### OPT-1.3B
To get started, you can try a small model like OPT-1.3B first. It fits into a single GPU so no offloading is required.
FlexLLMGen will automatically download weights from Hugging Face.
```
python3 -m flexllmgen.flex_opt --model facebook/opt-1.3b
```

You should see some text generated by OPT-1.3B and the benchmark results.

#### OPT-30B
To run large models like OPT-30B, you will need to use CPU offloading. You can try commands below.
The `--percent` argument specifies the offloading strategy for parameters, attention cache and hidden states separately.
The exact meaning of this argument can be found [here](https://github.com/FMInference/FlexLLMGen/blob/9d092d848f106cd9eaf305c12ef3590f7bcb0277/flexllmgen/flex_opt.py#L1271-L1279).
```
python3 -m flexllmgen.flex_opt --model facebook/opt-30b --percent 0 100 100 0 100 0
```

#### OPT-175B
To run OPT-175B, you need to download the weights from [metaseq](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT) and convert the weights into Alpa [format](https://alpa.ai/tutorials/opt_serving.html#convert-opt-175b-weights-into-alpa-formats).
You can then try offloading all weights to disk by running:
```
python3 -m flexllmgen.flex_opt --model facebook/opt-175b --percent 0 0 100 0 100 0 --offload-dir YOUR_SSD_FOLDER
```

### Run HELM Benchmark with FlexLLMGen
FlexLLMGen can be integrated into [HELM](https://crfm.stanford.edu/helm), a language model benchmark framework, as its execution backend.
You can use the commands below to run a Massive Multitask Language Understanding (MMLU) [scenario](https://crfm.stanford.edu/helm/latest/?group=mmlu) with a single T4 (16GB) GPU and 200GB of DRAM.
```
pip install crfm-helm
python3 -m flexllmgen.apps.helm_run --description mmlu:model=text,subject=abstract_algebra,data_augmentation=canonical --pad-to-seq-len 512 --model facebook/opt-30b --percent 20 80 0 100 0 100 --gpu-batch-size 48 --num-gpu-batches 3 --max-eval-instance 100
```
Note that only a subset of HELM scenarios is tested. See more tested scenarios [here](flexllmgen/apps/helm_passed_30b.sh).

### Run Data Wrangling Tasks with FlexLLMGen
You can run the examples in this paper, ['Can Foundation Models Wrangle Your Data?'](https://arxiv.org/abs/2205.09911), by following the instructions [here](flexllmgen/apps/data_wrangle).

### Scaling to Distributed GPUs
If you have multiple machines with GPUs, FlexLLMGen can combine offloading with pipeline parallelism to allow scaling.
For example, if you have 2 GPUs but the aggregated GPU memory is less than the model size, you still need offloading. FlexLLMGen allows you to do pipeline parallelism with these 2 GPUs to accelerate the generation.
But to have scaled performance, you should have GPUs on distributed machines.
See examples [here](https://github.com/FMInference/FlexLLMGen/tree/main/benchmark/flexllmgen#distributed-gpus).

### API Example
We demonstrate the usage of FlexLLMGen API in [completion.py](flexllmgen/apps/completion.py).
This example shows how to run generation for two sentences.
To get the best throughput out of FlexLLMGen, you typically need to batch more sentences.

#### Generation API
FlexLLMGen has a generation API following the style of Hugging Face's transformers.
```python
output_ids = model.generate(
	input_ids,
	do_sample=True,
	temperature=0.7,
	max_new_tokens=32,
	stop=stop)
```

#### Example Commands
You can use the example commands below.
If you do not have enough GPU/CPU memory, see the [How to handle out-of-memory?](#how-to-handle-out-of-memory) section.

```
# Complete with OPT-6.7B. You need at least 15GB of GPU memory.
python3 -m flexllmgen.apps.completion --model facebook/opt-6.7b
```

```
# Complete with OPT-30B. You need about 90GB of CPU memory.
python3 -m flexllmgen.apps.completion --model facebook/opt-30b --percent 0 100 100 0 100 0
```

```
# Complete with instruction-tuned OPT-IML-MAX-30B. You need about 90GB of CPU memory.
python3 -m flexllmgen.apps.completion --model facebook/opt-iml-max-30b --percent 0 100 100 0 100 0
```

### Frequently Asked Questions

#### How to set the offloading strategy and `--percent`?
We will release an automatic policy optimizer later, but now you have to manually try a few strategies.
The idea of high-throughput generation is to offload parameters and attention cache as much as possible to the CPU and disk if necessary.
You can see the reference strategies in our benchmark [here](https://github.com/FMInference/FlexLLMGen/blob/9d092d848f106cd9eaf305c12ef3590f7bcb0277/benchmark/flexllmgen/bench_suite.py#L39-L79).
To avoid out-of-memory, you can tune the `--percent` to offload more tensors to the CPU and disk.


#### How to handle out-of-memory?
If you do not have enough GPU/CPU memory, here are a few things you can try.
They save more memory but run slower.

- Do not pin weights by adding `--pin-weight 0`. This can reduce the weight memory usage on CPU by around 20% or more.
- Enable weight compression by adding `--compress-weight`. This can reduce the weight memory usage by around 70%.
- Offload all weights to disk by using `--percent 0 0 100 0 100 0`. This requires very little CPU and GPU memory.

## Performance Results
### Generation Throughput (token/s)
The corresponding effective batch sizes and lowest offloading devices are in parentheses. Please see [here](benchmark/batch_size_table.md) for more details.
| System | OPT-6.7B | OPT-30B | OPT-175B |
| ------ | -------- | ------- | -------- |
| Hugging Face Accelerate  | 25.12 (2 on GPU)  | 0.62 (8 on CPU) | 0.01 (2 on disk) |
| DeepSpeed ZeRO-Inference | 9.28 (16 on CPU)  | 0.60 (4 on CPU) | 0.01 (1 on disk) |
| Petals                 | 8.25 (2 on GPU) | 2.84 (2 on GPU) | 0.08 (2 on GPU) |
| FlexLLMGen                  | 25.26 (2 on GPU) | 7.32 (144 on CPU) | 0.69 (256 on disk) |
| FlexLLMGen with Compression | **29.12** (72 on GPU) | **8.38** (512 on CPU) | **1.12** (144 on CPU) |

- Hardware: an NVIDIA T4 (16GB) instance on GCP with 208GB of DRAM and 1.5TB of SSD.  
- Workload: input sequence length = 512, output sequence length = 32. The batch size is tuned to **a large value** that maximizes the generation throughput for each system.
- Metric: generation throughput (token/s) = number of the generated tokens / (time for processing prompts + time for generation).  

How to [reproduce](benchmark/flexllmgen).

### Latency-Throughput Trade-Off
The figure below shows the latency and throughput trade-off of three offloading-based systems on OPT-175B (left) and OPT-30B (right).
FlexLLMGen achieves a new Pareto-optimal frontier with significantly higher maximum throughput for both models.
Other systems cannot further increase throughput due to out-of-memory.
"FlexLLMGen(c)" is FlexLLMGen with compression.

<img src="https://github.com/FMInference/FlexLLMGen/blob/main/docs/throughput_vs_latency.jpg" alt="image" width="500"></img>

## How It Works
FlexLLMGen can be flexibly configured under various hardware resource constraints by aggregating memory and computation from the GPU, CPU, and disk. Through a linear programming optimizer, it searches for the best pattern to store and access the tensors, including weights, activations, and attention key/value (KV) cache. FlexLLMGen further compresses both weights and KV cache to 4 bits with negligible accuracy loss.

One key idea of FlexLLMGen is to play the latency-throughput trade-off. Achieving low latency is inherently challenging for offloading methods,
but the I/O efficiency of offloading can be greatly boosted for throughput-oriented scenarios (see the figure above).
FlexLLMGen utilizes a block schedule to reuse weights and overlap I/O with computation, as shown in figure (b) below, while other baseline systems use an inefficient row-by-row schedule, as shown in figure (a) below.

<img src="https://github.com/FMInference/FlexLLMGen/raw/main/docs/block_schedule.jpg" alt="image" width="500"></img>

For more technical details, see our [paper](https://arxiv.org/abs/2303.06865).

## Roadmap
We plan to work on the following features.

- [ ] Optimize the performance for multiple GPUs on the same machine
- [ ] Support more models (BLOOM, CodeGen, GLM)
- [X] Release the cost model and policy optimizer
- [ ] Macbook Support (M1 and M2)
- [ ] AMD Support


================================================
FILE: benchmark/batch_size_table.md
================================================
## Effective Batch Size of Each System

### Setup
- Hardware: an NVIDIA T4 (16GB) instance on GCP with 208GB of DRAM and 1.5TB of SSD.  
- Workload: input sequence length = 512, output sequence length = 32.

### Effective Batch Size

The table below lists the effective batch size of each system.
The device in parentheses denotes the lowest level of the memory hierarchy that the system needs for offloading.
The batch size is tuned for each system to achieve its maximum throughput with the following principle:
- Find a level of memory hierarchy that can hold all tensors for generation. Avoid unnecessary offloading to slower storage.
- Tune the system to use as large a batch size as possible without running out of memory.

| System | OPT-6.7B | OPT-30B | OPT-175B |
| ------ | -------- | ------- | -------- |
| Hugging Face Accelerate  | 2  (gpu) | 8 (cpu)   | 2 (disk)   |
| DeepSpeed ZeRO-Inference | 16 (cpu) | 4 (cpu)   | 1 (disk)   |
| FlexLLMGen                  | 2  (gpu) | 144 (cpu) | 256 (disk) |
| FlexLLMGen with Compression | 72 (gpu) | 512 (cpu) | 144 (cpu)  |

### Generation Throughput (token/s)
We attach the generation throughput here for reference.

| System | OPT-6.7B | OPT-30B | OPT-175B |
| ------ | -------- | ------- | -------- |
| Hugging Face Accelerate   | 25.12 | 0.62 | 0.01 |
| DeepSpeed ZeRO-Inference | 9.28  | 0.60 | 0.01 |
| FlexLLMGen                  | 25.26 | 7.32 | 0.69 |
| FlexLLMGen with Compression | **29.12** | **8.38** | **1.12** |

### About Petals
We also include [Petals](https://arxiv.org/abs/2209.01188) as an additional baseline.
We measure the results of running OPT hosted on 1, 4, and 24 T4 GPUs (in case of 6.7B, 30B, and 175B respectively) on GCP.
We perform 6 parallel requests to the system and divide the throughput by the number of used GPUs in each case.
For a more comprehensive comparison with Petals, see Section 6.3 in our paper.


================================================
FILE: benchmark/flexgen/bench_scan_175b.sh
================================================
# Single-GPU OPT-175B benchmark run with a GPU batch size of 1 and a generation
# length of 1. Per the README, --percent sets the offloading split for
# parameters, attention cache and hidden states.
# The module is flexllmgen.flex_opt (not the old flexgen.flex_opt name), which
# matches the package name used by the README and the other benchmark scripts.
# NOTE(review): --path _DUMMY_ presumably selects dummy weights -- confirm.
python3 -m flexllmgen.flex_opt --model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 100 100 0 100 0 --gpu-batch-size 1 --gen-len 1 --sep-layer 0


================================================
FILE: benchmark/flexllmgen/README.md
================================================
# Benchmark FlexLLMGen
NOTE: This benchmark uses dummy weights by default for faster experiments.
Seeing randomly generated garbled characters is expected; the throughput and latency numbers should still be correct.

## Mount SSD
The following commands use `~/flexllmgen_offload_dir` as the offloading folder by default.
To get the best performance, it is recommended to mount this folder on a fast SSD.
If you use AWS or GCP instances with local SSDs, you can use [mount_nvme_aws.sh](../../scripts/mount_nvme_aws.sh) or [mount_nvme_gcp.sh](../../scripts/mount_nvme_gcp.sh) to mount the local SSDs.

## Single GPU

### OPT-6.7B
```
# fp16
python3 bench_suite.py 6b7_1x1

# with int4 compression
python3 bench_suite.py 6b7_1x1_comp
```

### OPT-30B
```
# fp16
python3 bench_suite.py 30b_1x1

# with int4 compression
python3 bench_suite.py 30b_1x1_comp
```

### OPT-175B
```
# fp16
python3 bench_suite.py 175b_1x1

# with int4 compression
python3 bench_suite.py 175b_1x1_comp
```

## Distributed GPUs

### Requirements
```
sudo apt install openmpi-bin
```

### OPT-6.7B
```
# 1 node with 4 GPUs
bash bench_6.7b_1x4.sh

# 4 nodes and one GPU per node
bash bench_6.7b_4x1.sh
```

### OPT-30B
```
# 1 node with 4 GPUs
bash bench_30b_1x4.sh

# 4 nodes and one GPU per node
bash bench_30b_4x1.sh
```

### OPT-175B
```
# 1 node with 4 GPUs
bash bench_175b_1x4.sh

# 4 nodes and one GPU per node
bash bench_175b_4x1.sh
```


================================================
FILE: benchmark/flexllmgen/bench_175b_1x4.sh
================================================
#!/bin/bash
#
# Benchmark flexllmgen.dist_flex_opt on facebook/opt-175b using a single host:
# mpirun starts N_GPUS processes on this machine, each bound to
# N_CORES_PER_GPU cores.

# This host's address is both the entire mpirun host list and the --head-ip.
MY_IPADDR=$(hostname -i)
all_hosts=$MY_IPADDR
N_GPUS=4
N_CORES_PER_GPU=12

# The interpreter comes from the currently active conda environment.
PYTHON_EXEC=$CONDA_PREFIX/bin/python
PYTHON_SCRIPT=flexllmgen.dist_flex_opt

# Kill the processes listed by `pgrep -fl python`, except those whose listing
# line contains "dist_flex_opt.py".
# -r (GNU xargs --no-run-if-empty) skips `sudo kill` entirely when no PID is
# selected, instead of running it without arguments and getting a usage error.
# NOTE(review): this also targets unrelated Python processes on the machine;
# confirm that is intended.
pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs -r sudo kill

# Trace the expanded mpirun command line before it runs.
set -x

# --map-by ppr:N:node:pe=M places N processes per node with M cores each, and
# -x exports OMP_NUM_THREADS to every process. The lo and docker0 interfaces
# are excluded from Open MPI's TCP traffic.
# NOTE(review): --path _DUMMY_ presumably selects dummy weights -- confirm.
mpirun \
  --mca btl_tcp_if_exclude lo,docker0 \
  --mca oob_tcp_if_exclude lo,docker0 \
  --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
  --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
  $PYTHON_EXEC -m $PYTHON_SCRIPT \
    --head-ip $MY_IPADDR \
    --port 7777 \
    --use-mpi \
    --model facebook/opt-175b \
    --gpu-batch-size 20 \
    --percent 0 100 0 100 0 100 \
    --comm-device cpu \
    --path _DUMMY_ \
    --cut-gen-len 5 \
    --pin-weight 0 \
    --cpu


================================================
FILE: benchmark/flexllmgen/bench_175b_4x1.sh
================================================
#!/bin/bash
#
# Benchmark flexllmgen.dist_flex_opt on facebook/opt-175b across N_NODES hosts,
# with N_GPUS process(es) per host, launched through mpirun.

N_GPUS=1
N_NODES=4
N_CORES_PER_GPU=16

# Collect host addresses: this host via `hostname -i`, and every Ray worker by
# running `hostname -i` on it over ssh. The ssh calls run in the background in
# parallel, each writing its answer to /tmp/<worker>.ip; `wait` blocks until
# all of them have finished.
MY_IPADDR=$(hostname -i)
all_public_ips=$(ray get-worker-ips ~/ray_bootstrap_config.yaml)
for s in $all_public_ips; do
    ssh -o StrictHostKeyChecking=no $s hostname -i > /tmp/$s.ip &
done
wait
for s in $all_public_ips; do
    OTHERS_IPADDR+=($(cat /tmp/$s.ip))
done
# This host goes first; only the first N_NODES addresses are kept and joined
# with commas to form the mpirun -H host list.
ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
all_hosts=$(echo ${ALL_IPADDR[@]:0:$N_NODES} | sed 's/ /,/g')

# The interpreter comes from the currently active conda environment.
PYTHON_EXEC=$CONDA_PREFIX/bin/python
PYTHON_SCRIPT=flexllmgen.dist_flex_opt

# Kill the processes listed by `pgrep -fl python`, except those whose listing
# line contains "dist_flex_opt.py".
# -r (GNU xargs --no-run-if-empty) skips `sudo kill` entirely when no PID is
# selected, instead of running it without arguments and getting a usage error.
# NOTE(review): this also targets unrelated Python processes on the machine;
# confirm that is intended.
pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs -r sudo kill

# Trace the expanded mpirun command line before it runs.
set -x

# --map-by ppr:N:node:pe=M places N processes per node with M cores each, and
# -x exports OMP_NUM_THREADS to every process. The lo and docker0 interfaces
# are excluded from Open MPI's TCP traffic.
# NOTE(review): --path _DUMMY_ presumably selects dummy weights -- confirm.
mpirun \
  --mca btl_tcp_if_exclude lo,docker0 \
  --mca oob_tcp_if_exclude lo,docker0 \
  --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
  --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
  $PYTHON_EXEC -m $PYTHON_SCRIPT \
    --head-ip $MY_IPADDR \
    --port 7777 \
    --use-mpi \
    --model facebook/opt-175b \
    --gpu-batch-size 40 \
    --num-inner-iterations 4 \
    --percent 0 100 0 100 0 100 \
    --comm-device cpu \
    --path _DUMMY_ \
    --cut-gen-len 5 \
    --pin-weight 0 \
    --cpu \
    --async-comm


================================================
FILE: benchmark/flexllmgen/bench_30b_1x4.sh
================================================
#!/bin/bash
#
# Benchmark flexllmgen.dist_flex_opt on facebook/opt-30b using a single host:
# mpirun starts N_GPUS processes on this machine, each bound to
# N_CORES_PER_GPU cores.

# This host's address is both the entire mpirun host list and the --head-ip.
MY_IPADDR=$(hostname -i)
all_hosts=$MY_IPADDR
N_GPUS=4
N_CORES_PER_GPU=12

# The interpreter comes from the currently active conda environment.
PYTHON_EXEC=$CONDA_PREFIX/bin/python
PYTHON_SCRIPT=flexllmgen.dist_flex_opt

# Kill the processes listed by `pgrep -fl python`, except those whose listing
# line contains "dist_flex_opt.py".
# -r (GNU xargs --no-run-if-empty) skips `sudo kill` entirely when no PID is
# selected, instead of running it without arguments and getting a usage error.
# NOTE(review): this also targets unrelated Python processes on the machine;
# confirm that is intended.
pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs -r sudo kill

# Trace the expanded mpirun command line before it runs.
set -x

# --map-by ppr:N:node:pe=M places N processes per node with M cores each, and
# -x exports OMP_NUM_THREADS to every process. The lo and docker0 interfaces
# are excluded from Open MPI's TCP traffic.
# NOTE(review): --path _DUMMY_ presumably selects dummy weights -- confirm.
mpirun \
  --mca btl_tcp_if_exclude lo,docker0 \
  --mca oob_tcp_if_exclude lo,docker0 \
  --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
  --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
  $PYTHON_EXEC -m $PYTHON_SCRIPT \
    --head-ip $MY_IPADDR \
    --port 7777 \
    --use-mpi \
    --model facebook/opt-30b \
    --gpu-batch-size 72 \
    --percent 20 80 0 100 0 100 \
    --comm-device cpu \
    --path _DUMMY_ \
    --cut-gen-len 5 \
    --cpu


================================================
FILE: benchmark/flexllmgen/bench_30b_4x1.sh
================================================
#!/bin/bash
#
# Benchmark flexllmgen.dist_flex_opt on facebook/opt-30b across N_NODES hosts,
# with N_GPUS process(es) per host, launched through mpirun.

N_GPUS=1
N_NODES=4
N_CORES_PER_GPU=16

# Collect host addresses: this host via `hostname -i`, and every Ray worker by
# running `hostname -i` on it over ssh. The ssh calls run in the background in
# parallel, each writing its answer to /tmp/<worker>.ip; `wait` blocks until
# all of them have finished.
MY_IPADDR=$(hostname -i)
all_public_ips=$(ray get-worker-ips ~/ray_bootstrap_config.yaml)
for s in $all_public_ips; do
    ssh -o StrictHostKeyChecking=no $s hostname -i > /tmp/$s.ip &
done
wait
for s in $all_public_ips; do
    OTHERS_IPADDR+=($(cat /tmp/$s.ip))
done
# This host goes first; only the first N_NODES addresses are kept and joined
# with commas to form the mpirun -H host list.
ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
all_hosts=$(echo ${ALL_IPADDR[@]:0:$N_NODES} | sed 's/ /,/g')

# The interpreter comes from the currently active conda environment.
PYTHON_EXEC=$CONDA_PREFIX/bin/python
PYTHON_SCRIPT=flexllmgen.dist_flex_opt

# Kill the processes listed by `pgrep -fl python`, except those whose listing
# line contains "dist_flex_opt.py".
# -r (GNU xargs --no-run-if-empty) skips `sudo kill` entirely when no PID is
# selected, instead of running it without arguments and getting a usage error.
# NOTE(review): this also targets unrelated Python processes on the machine;
# confirm that is intended.
pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs -r sudo kill

# Trace the expanded mpirun command line before it runs.
set -x

# --map-by ppr:N:node:pe=M places N processes per node with M cores each, and
# -x exports OMP_NUM_THREADS to every process. The lo and docker0 interfaces
# are excluded from Open MPI's TCP traffic.
# NOTE(review): --path _DUMMY_ presumably selects dummy weights -- confirm.
mpirun \
  --mca btl_tcp_if_exclude lo,docker0 \
  --mca oob_tcp_if_exclude lo,docker0 \
  --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
  --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
  $PYTHON_EXEC -m $PYTHON_SCRIPT \
    --head-ip $MY_IPADDR \
    --port 7777 \
    --use-mpi \
    --model facebook/opt-30b \
    --num-inner-iterations 4 \
    --percent 20 80 0 100 0 100 --gpu-batch-size 64 --num-gpu-batches 3 \
    --comm-device cpu \
    --path _DUMMY_ \
    --cut-gen-len 5 \
    --cpu \
    --async-comm


================================================
FILE: benchmark/flexllmgen/bench_6.7b_1x4.sh
================================================
#!/bin/bash
# Benchmark facebook/opt-6.7b with flexllmgen.dist_flex_opt, launched through
# mpirun on this machine only with N_GPUS ranks.

# Address printed by `hostname -i`; used as the only MPI host and as --head-ip.
MY_IPADDR=$(hostname -i)
all_hosts=$MY_IPADDR
# Number of ranks started on the node (ppr:$N_GPUS:node).
N_GPUS=4
# Cores bound to each rank (pe=...); also exported as OMP_NUM_THREADS.
N_CORES_PER_GPU=6

# Interpreter of the active conda environment and the module run with `-m`.
PYTHON_EXEC=$CONDA_PREFIX/bin/python
PYTHON_SCRIPT=flexllmgen.dist_flex_opt

# List processes whose command line matches "python", drop the lines that
# contain "dist_flex_opt.py", and pass the remaining PIDs to `sudo kill`.
# NOTE(review): the ranks below are started with `-m flexllmgen.dist_flex_opt`
# (no ".py"), so confirm this filter selects the processes that are intended.
pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill

# Echo the mpirun command line as it is executed.
set -x

# The mpirun invocation excludes the lo and docker0 interfaces from Open MPI's
# TCP transports and binds each rank to its own set of cores.
mpirun \
  --mca btl_tcp_if_exclude lo,docker0 \
  --mca oob_tcp_if_exclude lo,docker0 \
  --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
  --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
  $PYTHON_EXEC -m $PYTHON_SCRIPT \
    --head-ip $MY_IPADDR \
    --port 7777 \
    --use-mpi \
    --model facebook/opt-6.7b \
    --gpu-batch-size 24 \
    --percent 100 0 100 0 100 0 \
    --comm-device cpu \
    --cut-gen-len 5 \
    --path _DUMMY_


================================================
FILE: benchmark/flexllmgen/bench_6.7b_4x1.sh
================================================
#!/bin/bash
# Benchmark facebook/opt-6.7b with flexllmgen.dist_flex_opt across N_NODES
# hosts with N_GPUS rank(s) per host, launched through mpirun.

# Ranks per host (ppr:$N_GPUS:node).
N_GPUS=1
# Number of hosts taken from the discovered address list.
N_NODES=4
# Cores bound to each rank (pe=...); also exported as OMP_NUM_THREADS.
N_CORES_PER_GPU=16

# Address printed by `hostname -i` on this host; also passed as --head-ip.
MY_IPADDR=$(hostname -i)
# Worker addresses reported by Ray for ~/ray_bootstrap_config.yaml.
all_public_ips=$(ray get-worker-ips ~/ray_bootstrap_config.yaml)
# Ask every worker over ssh (in the background, so in parallel) for its own
# `hostname -i` output and store it in /tmp/<worker>.ip.
for s in $all_public_ips; do
    ssh -o StrictHostKeyChecking=no $s hostname -i > /tmp/$s.ip &
done
# Block until every background ssh job has finished.
wait
# Collect the addresses recorded above.
for s in $all_public_ips; do
    OTHERS_IPADDR+=($(cat /tmp/$s.ip))
done
# This host first, then the workers; keep the first N_NODES entries and join
# them with commas for mpirun's -H option.
ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
all_hosts=$(echo ${ALL_IPADDR[@]:0:$N_NODES} | sed 's/ /,/g')

# Interpreter of the active conda environment and the module run with `-m`.
PYTHON_EXEC=$CONDA_PREFIX/bin/python
PYTHON_SCRIPT=flexllmgen.dist_flex_opt

# List processes whose command line matches "python", drop the lines that
# contain "dist_flex_opt.py", and pass the remaining PIDs to `sudo kill`.
# NOTE(review): the ranks below are started with `-m flexllmgen.dist_flex_opt`
# (no ".py"), so confirm this filter selects the processes that are intended.
pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill

# Echo the mpirun command line as it is executed.
set -x

# The mpirun invocation excludes the lo and docker0 interfaces from Open MPI's
# TCP transports and binds each rank to its own set of cores.
mpirun \
  --mca btl_tcp_if_exclude lo,docker0 \
  --mca oob_tcp_if_exclude lo,docker0 \
  --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
  --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
  $PYTHON_EXEC -m $PYTHON_SCRIPT \
    --head-ip $MY_IPADDR \
    --port 7777 \
    --use-mpi \
    --model facebook/opt-6.7b \
    --gpu-batch-size 24 \
    --percent 100 0 100 0 100 0 \
    --comm-device gpu \
    --cut-gen-len 5 \
    --path _DUMMY_


================================================
FILE: benchmark/flexllmgen/bench_dist_multi_node.sh
================================================
#!/bin/bash
# Run flexllmgen.dist_flex_opt with facebook/opt-1.3b across N_NODES hosts
# with N_GPUS rank(s) per host, launched through mpirun.

# Ranks per host (ppr:$N_GPUS:node).
N_GPUS=1
# Number of hosts taken from the discovered address list.
N_NODES=4
# Cores bound to each rank (pe=...); also exported as OMP_NUM_THREADS.
N_CORES_PER_GPU=16

# Address printed by `hostname -i` on this host; also passed as --head-ip.
MY_IPADDR=$(hostname -i)
# Worker addresses reported by Ray for ~/ray_bootstrap_config.yaml.
all_public_ips=$(ray get-worker-ips ~/ray_bootstrap_config.yaml)
# Ask every worker over ssh (in the background, so in parallel) for its own
# `hostname -i` output and store it in /tmp/<worker>.ip.
for s in $all_public_ips; do
    ssh -o StrictHostKeyChecking=no $s hostname -i > /tmp/$s.ip &
done
# Block until every background ssh job has finished.
wait
# Collect the addresses recorded above.
for s in $all_public_ips; do
    OTHERS_IPADDR+=($(cat /tmp/$s.ip))
done
# This host first, then the workers; keep the first N_NODES entries and join
# them with commas for mpirun's -H option.
ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
all_hosts=$(echo ${ALL_IPADDR[@]:0:$N_NODES} | sed 's/ /,/g')

# Interpreter of the active conda environment and the module run with `-m`.
PYTHON_EXEC=$CONDA_PREFIX/bin/python
PYTHON_SCRIPT=flexllmgen.dist_flex_opt

# List processes whose command line matches "python", drop the lines that
# contain "dist_flex_opt.py", and pass the remaining PIDs to `sudo kill`.
# NOTE(review): the ranks below are started with `-m flexllmgen.dist_flex_opt`
# (no ".py"), so confirm this filter selects the processes that are intended.
pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill

# Echo the mpirun command line as it is executed.
set -x

# The mpirun invocation excludes the lo and docker0 interfaces from Open MPI's
# TCP transports and binds each rank to its own set of cores.
mpirun \
  --mca btl_tcp_if_exclude lo,docker0 \
  --mca oob_tcp_if_exclude lo,docker0 \
  --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
  --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
  $PYTHON_EXEC -m $PYTHON_SCRIPT \
    --head-ip $MY_IPADDR \
    --port 7777 \
    --use-mpi \
    --model facebook/opt-1.3b \
    --gpu-batch-size 16 \
    --num-gpu-batches 2 \
    --percent 100 0 100 0 100 0 \
    --comm-device gpu \
    --async-comm



================================================
FILE: benchmark/flexllmgen/bench_dist_single_node.sh
================================================
#!/bin/bash
# Run flexllmgen.dist_flex_opt with facebook/opt-1.3b, launched through mpirun
# on this machine only with N_GPUS ranks.

# Address printed by `hostname -i`; used as the only MPI host and as --head-ip.
MY_IPADDR=$(hostname -i)
all_hosts=$MY_IPADDR
# Number of ranks started on the node (ppr:$N_GPUS:node).
N_GPUS=4
# Cores bound to each rank (pe=...); also exported as OMP_NUM_THREADS.
N_CORES_PER_GPU=4

# Interpreter of the active conda environment and the module run with `-m`.
PYTHON_EXEC=$CONDA_PREFIX/bin/python
PYTHON_SCRIPT=flexllmgen.dist_flex_opt

# List processes whose command line matches "python", drop the lines that
# contain "dist_flex_opt.py", and pass the remaining PIDs to `sudo kill`.
# NOTE(review): the ranks below are started with `-m flexllmgen.dist_flex_opt`
# (no ".py"), so confirm this filter selects the processes that are intended.
pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill

# Echo the mpirun command line as it is executed.
set -x

# The mpirun invocation excludes the lo and docker0 interfaces from Open MPI's
# TCP transports and binds each rank to its own set of cores.
mpirun \
  --mca btl_tcp_if_exclude lo,docker0 \
  --mca oob_tcp_if_exclude lo,docker0 \
  --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
  --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
  $PYTHON_EXEC -m $PYTHON_SCRIPT \
    --head-ip $MY_IPADDR \
    --port 7777 \
    --use-mpi \
    --model facebook/opt-1.3b \
    --gpu-batch-size 16 \
    --percent 100 0 100 0 100 0 \
    --comm-device gpu



================================================
FILE: benchmark/flexllmgen/bench_suite.py
================================================
import argparse
from dataclasses import dataclass

from flexllmgen.utils import run_cmd


@dataclass
class Case:
    """One benchmark invocation of ``python -m flexllmgen.flex_opt``.

    Attributes:
        command: Argument string appended verbatim after
            ``python -m flexllmgen.flex_opt``.
        name: Human-readable label written to the log-file header for the case.
        use_page_maga: When true, the driver prefixes the command with
            ``bash /usr/local/bin/pagecache-management.sh``.
    """

    command: str
    name: str = ""
    use_page_maga: bool = False


# Judging from the case names in ``suite_1b3_test``, the first four values of
# ``--percent`` are (weight GPU%, weight CPU%, cache GPU%, cache CPU%), and
# whatever is left of each pair goes to disk.
# NOTE(review): the last pair is presumably the activation placement; confirm
# against the flex_opt command-line help.

# Small-model smoke test covering the different placement/compression options.
suite_1b3_test = [
    # All GPU
    Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 100 0 100 0 --cut-gen-len 8", "All GPU"),
    # Weight on CPU, cache on GPU
    Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 0 100 100 0 100 0 --cut-gen-len 8", "Weight on CPU, cache on GPU"),
    # Weight on GPU, cache on CPU
    Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 0 100 100 0 --cut-gen-len 8 --cpu", "Weight on GPU, cache on CPU"),
    # Weight on CPU, cache on CPU
    Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 0 100 0 100 100 0 --cut-gen-len 8 --cpu", "Weight on CPU, cache on CPU"),
    # Weight on disk, cache on GPU
    Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 0 0 100 0 100 0 --cut-gen-len 8", "Weight on disk, cache on GPU", True),
    # Weight on GPU, cache on disk
    Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 0 0 100 0 --cut-gen-len 8 --cpu", "Weight on GPU, cache on disk", True),
    # Weight on CPU/GPU (50-50 split), cache on GPU
    Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 50 50 100 0 100 0 --cut-gen-len 8", "Weight on both CPU/GPU (50-50 split), cache on GPU"),
    # Weight on GPU, cache on CPU/GPU (50-50 split)
    Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 50 50 100 0 --cut-gen-len 8 --cpu", "Weight on GPU, cache on CPU/GPU (50-50 split)"),
    # Weight on GPU, cache on disk, sparse attention
    Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 0 0 100 0 --cut-gen-len 8 --cpu --attn-sparsity 0.1", "Weight on GPU, cache on disk, sparse attention", True),
    # Weight on GPU, cache on disk, cache quantization
    Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 0 0 100 0 --cut-gen-len 8 --compress-cache", "Weight on GPU, cache on disk, cache quantization", True),
    # All GPU, 2 GPU batches
    Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 100 0 100 0 --cut-gen-len 8 --num-gpu-batches 2", "All GPU, 2 gpu batches"),
]

# NOTE(review): in the suites below, cases without an explicit --prompt-len /
# --gen-len rely on flex_opt's defaults; the "seq_len = 512" comments suggest
# the default prompt length is 512 -- confirm.

suite_6b7_1x1 = [
    # seq_len = 256, gen_len = 32
    # 53.29 token/s
    Case("--model facebook/opt-6.7b --path _DUMMY_ --prompt-len 256 --gen-len 32 --percent 100 0 100 0 100 0 --gpu-batch-size 4 --overlap False"),
    # seq_len = 512, gen_len = 32
    Case("--model facebook/opt-6.7b --path _DUMMY_ --percent 100 0 100 0 100 0 --gpu-batch-size 2 --overlap False"),
    # seq_len = 1024, gen_len = 32
    Case("--model facebook/opt-6.7b --path _DUMMY_ --percent 100 0 100 0 100 0 --gpu-batch-size 1 --overlap False --prompt-len 1024"),
]

suite_6b7_1x1_comp = [
    # seq_len = 256, gen_len = 32
    # 56.72 token/s
    Case("--model facebook/opt-6.7b --path _DUMMY_ --prompt-len 256 --gen-len 32 --percent 100 0 100 0 100 0 --gpu-batch-size 128 --overlap False --compress-weight --compress-cache"),
    # seq_len = 512, gen_len = 32
    Case("--model facebook/opt-6.7b --path _DUMMY_ --percent 100 0 100 0 100 0 --gpu-batch-size 72 --overlap False --compress-weight --compress-cache"),
    # seq_len = 1024, gen_len = 32
    Case("--model facebook/opt-6.7b --path _DUMMY_ --percent 100 0 100 0 100 0 --gpu-batch-size 28 --overlap False --compress-weight --compress-cache --prompt-len 1024"),
]

suite_30b_1x1 = [
    # seq_len = 256, gen_len = 32
    # 16.01 token/s
    Case("--model facebook/opt-30b --path _DUMMY_ --prompt-len 256 --gen-len 32 --percent 10 90 0 100 0 100 --gpu-batch-size 160 --num-gpu-batches 2 --cpu --debug fewer_batch", "", False),
    # seq_len = 512, gen_len = 32
    Case("--model facebook/opt-30b --path _DUMMY_ --percent 20 80 0 100 0 100 --gpu-batch-size 48 --num-gpu-batches 3 --cpu --debug fewer_batch"),
    # seq_len = 1024, gen_len = 32
    Case("--model facebook/opt-30b --path _DUMMY_ --percent 4 96 0 100 0 100 --gpu-batch-size 20 --num-gpu-batches 4 --cpu --debug fewer_batch --prompt-len 1024"),
]

suite_30b_1x1_comp = [
    # seq_len = 256, gen_len = 32
    # 16.86 token/s
    Case("--model facebook/opt-30b --path _DUMMY_ --prompt-len 256 --gen-len 32 --percent 0 100 0 100 0 100 --gpu-batch-size 128 --num-gpu-batches 8 --debug fewer_batch --compress-cache"),
    # seq_len = 512, gen_len = 32
    Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size 64 --num-gpu-batches 8 --debug fewer_batch --compress-cache"),
    # Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size 16 --num-gpu-batches 20 --debug fewer_batch --compress-cache"),
    # seq_len = 1024, gen_len = 32
    Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size 20 --num-gpu-batches 12 --debug fewer_batch --compress-cache --prompt-len 1024"),
]

suite_175b_1x1 = [
    # seq_len = 256
    # 1.36 token/s
    Case("--model facebook/opt-175b --path _DUMMY_ --prompt-len 256 --gen-len 32 --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 64 --num-gpu-batches 8 --cpu --debug fewer_batch"),
    # seq_len = 512
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug fewer_batch"),
    # seq_len = 1024
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 12 --num-gpu-batches 12 --cpu --debug fewer_batch --prompt-len 1024"),
]

suite_175b_1x1_comp = [
    # seq_len = 256
    # 2.26 token/s
    Case("--model facebook/opt-175b --path _DUMMY_ --prompt-len 256 --gen-len 32 --pin-weight 0 --percent 0 100 0 100 0 100 --gpu-batch-size 96 --num-gpu-batches 3 --debug fewer_batch --compress-weight --compress-cache"),
    # seq_len = 512
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 100 0 100 0 100 --gpu-batch-size 48 --num-gpu-batches 3 --debug fewer_batch --compress-weight --compress-cache"),
    # seq_len = 1024
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 100 0 100 0 100 --gpu-batch-size 12 --num-gpu-batches 4 --debug fewer_batch --compress-weight --compress-cache --prompt-len 1024"),
]

suite_ablation_ds = [
    # 30B
    Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 100 100 0 100 0 --gpu-batch-size 8 --debug fewer_batch"),
    # 175B
    Case("--model facebook/opt-175b --path _DUMMY_ --percent 0 0 100 0 100 0 --gpu-batch-size 2 --debug fewer_batch"),
]

suite_ablation = [
    # 30B

    # 175B
    # no policy search
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 1 --cpu --debug fewer_batch"),
    # no overlapping
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug fewer_batch --overlap False"),
    # no cpu compute
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --debug fewer_batch"),
    # use deepspeed policy
    Case("--model facebook/opt-175b --path _DUMMY_ --percent 0 0 100 0 100 0 --gpu-batch-size 2 --debug fewer_batch"),
]

suite_ablation_policy = [
    # 30B
    Case("--model facebook/opt-30b --path _DUMMY_ --percent 20 80 0 100 0 100 --gpu-batch-size 48 --num-gpu-batches 3 --cpu --debug fewer_batch"),
    Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size 48 --num-gpu-batches 3 --cpu --debug fewer_batch"),
    Case("--model facebook/opt-30b --path _DUMMY_ --percent 20 80 0 100 0 100 --gpu-batch-size 48 --num-gpu-batches 1 --cpu --debug fewer_batch"),
    Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug fewer_batch", use_page_maga=True),
    Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 0 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug fewer_batch", use_page_maga=True),
    Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 1 --cpu --debug fewer_batch", use_page_maga=True),
    Case("--model facebook/opt-30b --path _DUMMY_ --percent 20 80 100 0 100 0 --gpu-batch-size 1 --num-gpu-batches 1 --cpu --debug fewer_batch"),
    Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 50 100 0 100 0 --gpu-batch-size 1 --num-gpu-batches 1 --cpu --debug fewer_batch", use_page_maga=True),

    # 175B
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 20 80 0 100 0 100 --gpu-batch-size 48 --num-gpu-batches 3 --cpu --debug fewer_batch"),
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 100 0 100 0 100 --gpu-batch-size 48 --num-gpu-batches 3 --cpu --debug fewer_batch"),
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 20 80 0 100 0 100 --gpu-batch-size 48 --num-gpu-batches 1 --cpu --debug fewer_batch"),
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug fewer_batch", use_page_maga=True),
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 0 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug fewer_batch", use_page_maga=True),
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 1 --cpu --debug fewer_batch", use_page_maga=True),
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 20 80 100 0 100 0 --gpu-batch-size 1 --num-gpu-batches 1 --cpu --debug fewer_batch"),
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 100 0 100 0 --gpu-batch-size 1 --num-gpu-batches 1 --cpu --debug fewer_batch", use_page_maga=True),
]

suite_175b_breakdown = [
    # seq_len = 512
    Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug breakdown"),
]

suite_175b_stage = [
    # 1x1 policy
    Case("--model facebook/opt-175b-stage --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug fewer_batch", "", True),

    # full cpu policy
    Case("--model facebook/opt-175b-stage --path _DUMMY_ --pin-weight 0 --percent 0 100 0 100 0 100 --gpu-batch-size 32 --num-gpu-batches 6 --cpu --debug fewer_batch", "", True),
]

# Registry mapping the suite names accepted on the command line to case lists.
suites = {
    "1b3_test": suite_1b3_test,

    "6b7_1x1": suite_6b7_1x1,
    "6b7_1x1_comp": suite_6b7_1x1_comp,

    "30b_1x1": suite_30b_1x1,
    "30b_1x1_comp": suite_30b_1x1_comp,

    "175b_1x1": suite_175b_1x1,
    "175b_1x1_comp": suite_175b_1x1_comp,

    "ablation": suite_ablation,
    # Previously defined but never registered, so it could not be selected.
    "ablation_ds": suite_ablation_ds,
    "ablation_policy": suite_ablation_policy,
    "175b_breakdown": suite_175b_breakdown,
    "175b_stage": suite_175b_stage,

    "all_1x1": (suite_6b7_1x1 + suite_6b7_1x1_comp +
                suite_30b_1x1 + suite_30b_1x1_comp +
                suite_175b_1x1 + suite_175b_1x1_comp),
}


if __name__ == "__main__":
    # Command line: one or more suite names and an optional log file that
    # receives a header, the command, and a closing fence for every case.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("suite", type=str, nargs="+")
    arg_parser.add_argument("--log-file", type=str)
    args = arg_parser.parse_args()

    log_path = args.log_file

    for suite_name in args.suite:
        for bench_case in suites[suite_name]:
            # Assemble the command: base invocation, optional --log-file,
            # then the optional page-cache wrapper in front of everything.
            pieces = ["python -m flexllmgen.flex_opt ", bench_case.command]
            if log_path:
                pieces.append(f" --log-file {log_path}")
            cmd = "".join(pieces)
            if bench_case.use_page_maga:
                cmd = f"bash /usr/local/bin/pagecache-management.sh {cmd}"

            if not log_path:
                run_cmd(cmd)
                continue

            # The log is closed again before the command runs, because the
            # command is given the same --log-file path.
            with open(log_path, "a") as log:
                log.write(f"#### {bench_case.name}\n```\n{cmd}\n")
            run_cmd(cmd)
            with open(log_path, "a") as log:
                log.write("```\n")


================================================
FILE: benchmark/hf_ds/README.md
================================================
# Benchmark Baselines

## Install
Install the forks of Huggingface/transformers and Microsoft/DeepSpeed following this [guide](../third_party/README.md).

```
pip3 install accelerate==0.15.0
```
Install the system library that DeepSpeed's asynchronous I/O (used for disk/NVMe offloading) depends on:
```
sudo apt-get install libaio-dev
```

## Run one case

### HuggingFace Accelerate
```
python3 hf_opt.py --model facebook/opt-1.3b --batch-size 16
```

### DeepSpeed 
```
deepspeed --num_gpus 1 hf_opt.py --model facebook/opt-1.3b --batch-size 16
```

## Run multiple cases
```
python3 bench_hf.py 6b7
python3 bench_hf.py 30b
python3 bench_hf.py 175b
```


================================================
FILE: benchmark/hf_ds/bench_all_1x4.sh
================================================
# Baselines on 4 GPUs: for each model, first the plain hf_opt.py run
# (--num-gpus 4), then the same script under the deepspeed launcher
# (--num_gpus 4). All runs pass --dummy and --cut-gen-len 5.

# OPT-6.7B
python3 hf_opt.py --num-gpus 4 --model facebook/opt-6.7b --dummy --cut-gen-len 5 --batch-size 16
deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-6.7b --dummy --cut-gen-len 5 --batch-size 48

# OPT-30B (with --cpu)
python3 hf_opt.py --num-gpus 4 --model facebook/opt-30b  --dummy --cut-gen-len 5 --batch-size 8 --cpu
deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-30b  --dummy --cut-gen-len 5 --batch-size 24 --cpu

# OPT-175B (with --cpu)
python3 hf_opt.py --num-gpus 4 --model facebook/opt-175b --dummy --cut-gen-len 5 --batch-size 2 --cpu
deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-175b --dummy --cut-gen-len 5 --batch-size 4 --cpu


================================================
FILE: benchmark/hf_ds/bench_ds_175b_4x1.sh
================================================
# DeepSpeed baseline: facebook/opt-175b on 4 nodes x 1 GPU (hosts taken from
# ./hostfile), batch size 4, with --dummy and --cpu.
deepspeed --num_nodes 4 --num_gpus 1 --master_port 7778 --hostfile hostfile \
    hf_opt.py --model facebook/opt-175b --batch-size 4 --cut-gen-len 5 --dummy --cpu


================================================
FILE: benchmark/hf_ds/bench_ds_30b_1x4.sh
================================================
# DeepSpeed baseline: facebook/opt-30b on 4 local GPUs, batch size 24, with
# --cpu and --dummy.
deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-30b --batch-size 24 --cut-gen-len 5 --cpu --dummy


================================================
FILE: benchmark/hf_ds/bench_ds_30b_4x1.sh
================================================
# DeepSpeed baseline: facebook/opt-30b on 4 nodes x 1 GPU (hosts taken from
# ./hostfile), batch size 24, with --dummy and --cpu.
deepspeed --num_nodes 4 --num_gpus 1 --master_port 7778 --hostfile hostfile \
    hf_opt.py --model facebook/opt-30b --batch-size 24 --cut-gen-len 5 --dummy --cpu


================================================
FILE: benchmark/hf_ds/bench_ds_6.7b_1x4.sh
================================================
# DeepSpeed baseline: facebook/opt-6.7b on 4 local GPUs, batch size 48, with
# --dummy.
deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-6.7b --batch-size 48 --cut-gen-len 5 --dummy


================================================
FILE: benchmark/hf_ds/bench_ds_6.7b_2x1.sh
================================================
# DeepSpeed baseline: facebook/opt-6.7b on 2 nodes x 1 GPU (hosts taken from
# ./hostfile), batch size 16, with --dummy.
deepspeed --num_nodes 2 --num_gpus 1 --master_port 7778 --hostfile hostfile \
    hf_opt.py --model facebook/opt-6.7b --batch-size 16 --cut-gen-len 5 --dummy


================================================
FILE: benchmark/hf_ds/bench_ds_6.7b_4x1.sh
================================================
# DeepSpeed baseline: facebook/opt-6.7b on 4 nodes x 1 GPU (hosts taken from
# ./hostfile), batch size 48, with --dummy.
deepspeed --num_nodes 4 --num_gpus 1 --master_port 7778 --hostfile hostfile \
    hf_opt.py --model facebook/opt-6.7b --batch-size 48 --cut-gen-len 5 --dummy


================================================
FILE: benchmark/hf_ds/bench_hf.py
================================================
import argparse
from dataclasses import dataclass
import time

from flexllmgen.utils import run_cmd


def run_huggingface(model, prompt_len, gen_len, cut_gen_len, batch_size,
                    num_nodes, num_gpus_per_node,
                    use_ds, cpu, disk, dummy, log_file=None, pkl_file=None):
    """Build the ``hf_opt.py`` command line for one run and execute it.

    Args:
        model: Value passed to ``--model``.
        prompt_len: Value passed to ``--prompt-len``.
        gen_len: Value passed to ``--gen-len``.
        cut_gen_len: Value passed to ``--cut-gen-len``; omitted when falsy.
        batch_size: Value passed to ``--batch-size``.
        num_nodes: Must be 1 (asserted).
        num_gpus_per_node: GPU count handed to the launcher.
        use_ds: Launch through ``deepspeed`` instead of plain ``python``.
        cpu: Add ``--cpu`` when true.
        disk: Add ``--disk`` when true.
        dummy: Add ``--dummy`` when true.
        log_file: Value for ``--log-file``; omitted when ``None``.
        pkl_file: Value for ``--pkl-file``; omitted when ``None``.
    """
    assert num_nodes == 1

    # The two launchers spell the GPU-count option differently.
    launcher = (f"deepspeed --num_gpus {num_gpus_per_node} hf_opt.py "
                if use_ds else
                f"python hf_opt.py --num-gpus {num_gpus_per_node} ")

    # Every fragment carries its own trailing space, so the fragments can be
    # concatenated directly.
    fragments = [
        launcher,
        f"--model {model} ",
        f"--prompt-len {prompt_len} --gen-len {gen_len} ",
        f"--batch-size {batch_size} ",
    ]
    if cut_gen_len:
        fragments.append(f"--cut-gen-len {cut_gen_len} ")

    for enabled, flag in ((cpu, "--cpu "), (disk, "--disk "),
                          (dummy, "--dummy ")):
        if enabled:
            fragments.append(flag)

    for value, option in ((log_file, "--log-file"), (pkl_file, "--pkl-file")):
        if value is not None:
            fragments.append(f"{option} {value} ")

    run_cmd("".join(fragments))


def bench_one_case(case):
    """Run one benchmark ``Case`` through ``run_huggingface``.

    Dummy weights are always requested. The generation length is cut to 5 for
    every model except ``facebook/opt-6.7b``, which runs the full length.

    Args:
        case: Object exposing ``model``, ``library``, ``prompt_len``,
            ``gen_len``, ``batch_size``, ``device``, ``num_nodes`` and
            ``num_gpus_per_node``.

    Raises:
        ValueError: If ``case.device`` is not ``"gpu"``, ``"cpu"`` or
            ``"disk"``.
    """
    # Validate the device first, so a typo fails with a clear message instead
    # of an UnboundLocalError on `cpu`/`disk` further down.
    if case.device == "gpu":
        cpu, disk = False, False
    elif case.device == "cpu":
        cpu, disk = True, False
    elif case.device == "disk":
        cpu, disk = False, True
    else:
        raise ValueError(
            f"Unknown device: {case.device!r} "
            "(expected 'gpu', 'cpu' or 'disk')")

    if case.model == "facebook/opt-6.7b":
        cut_gen_len = None
    else:
        cut_gen_len = 5
    dummy = True

    use_deepspeed = case.library == "ds"

    run_huggingface(case.model, case.prompt_len, case.gen_len, cut_gen_len,
                    case.batch_size, case.num_nodes, case.num_gpus_per_node,
                    use_ds=use_deepspeed,
                    cpu=cpu, disk=disk, dummy=dummy)


@dataclass
class Case:
    """One baseline benchmark configuration for ``hf_opt.py``.

    Attributes:
        model: Model name passed to ``--model``.
        library: ``"ds"`` selects the DeepSpeed launcher; any other value
            (``"hf"`` in the suites below) runs plain ``python hf_opt.py``.
        prompt_len: Prompt length passed to ``--prompt-len``.
        gen_len: Generation length passed to ``--gen-len``.
        batch_size: Batch size passed to ``--batch-size``.
        device: ``"gpu"``, ``"cpu"`` or ``"disk"``; mapped to the ``--cpu`` /
            ``--disk`` flags by ``bench_one_case``.
        num_nodes: Number of nodes (``run_huggingface`` asserts this is 1).
        num_gpus_per_node: GPU count handed to the launcher.
    """

    model: str
    library: str
    prompt_len: int
    gen_len: int
    batch_size: int
    device: str
    num_nodes: int = 1
    num_gpus_per_node: int = 1


# For 1 16GB T4
# Seq len = 256
suite_hf_6b7_s256 = [
    Case("facebook/opt-6.7b", "hf", 256, 32, 4, "gpu"),
]
suite_hf_30b_s256 = [
    Case("facebook/opt-30b", "hf", 256, 32, 16, "cpu"),
]
suite_hf_175b_s256 = [
    Case("facebook/opt-175b", "hf", 256, 32, 4, "disk"),
]

suite_ds_6b7_s256 = [
    Case("facebook/opt-6.7b", "ds", 256, 32, 32, "cpu"),
]
suite_ds_30b_s256 = [
    Case("facebook/opt-30b",  "ds", 256, 32, 8, "cpu"),
]
suite_ds_175b_s256 = [
    Case("facebook/opt-175b", "ds", 256, 32, 2, "disk"),
]

# Seq len = 512
suite_hf_6b7_s512 = [
    Case("facebook/opt-6.7b", "hf", 512, 32, 2, "gpu"),
]
suite_hf_30b_s512 = [
    Case("facebook/opt-30b",  "hf", 512, 32, 8, "cpu"),
]
suite_hf_175b_s512 = [
    Case("facebook/opt-175b", "hf", 512, 32, 2, "disk"),
]

suite_ds_6b7_s512 = [
    Case("facebook/opt-6.7b", "ds", 512, 32, 16, "cpu"),
]
suite_ds_30b_s512 = [
    Case("facebook/opt-30b",  "ds", 512, 32, 4, "cpu"),
]
suite_ds_175b_s512 = [
    Case("facebook/opt-175b", "ds", 512, 32, 1, "disk"),
]

# Seq len = 1024
suite_hf_6b7_s1024 = [
    Case("facebook/opt-6.7b", "hf", 1024, 32, 1, "gpu"),
]
suite_hf_30b_s1024 = [
    Case("facebook/opt-30b",  "hf", 1024, 32, 4, "cpu"),
]
suite_hf_175b_s1024 = [
    Case("facebook/opt-175b", "hf", 1024, 32, 1, "disk"),
]

suite_ds_6b7_s1024 = [
    Case("facebook/opt-6.7b", "ds", 1024, 32, 8, "cpu"),
]
suite_ds_30b_s1024 = [
    Case("facebook/opt-30b",  "ds", 1024, 32, 2, "cpu"),
]
suite_ds_175b_s1024 = [
    Case("facebook/opt-175b", "ds", 1024, 32, 1, "disk"),
]

# Registry mapping the suite names accepted on the command line to case lists.
suites = {
    "hf_s256": suite_hf_6b7_s256 + suite_hf_30b_s256 + suite_hf_175b_s256,
    "hf_s512": suite_hf_6b7_s512 + suite_hf_30b_s512 + suite_hf_175b_s512,
    "hf_s1024": suite_hf_6b7_s1024 + suite_hf_30b_s1024 + suite_hf_175b_s1024,

    # The 175B entry must be the DeepSpeed case; the HuggingFace one was
    # previously listed here by mistake.
    "ds_s256": suite_ds_6b7_s256 + suite_ds_30b_s256 + suite_ds_175b_s256,
    "ds_s512": suite_ds_6b7_s512 + suite_ds_30b_s512 + suite_ds_175b_s512,
    "ds_s1024": suite_ds_6b7_s1024 + suite_ds_30b_s1024 + suite_ds_175b_s1024,

    "6b7": suite_hf_6b7_s512 + suite_hf_6b7_s1024 + suite_ds_6b7_s512 + suite_ds_6b7_s1024,
    "30b": suite_hf_30b_s512 + suite_hf_30b_s1024 + suite_ds_30b_s512 + suite_ds_30b_s1024,
    "175b": suite_hf_175b_s512 + suite_hf_175b_s1024 + suite_ds_175b_s512 + suite_ds_175b_s1024,
}


if __name__ == "__main__":
    # Command line: one or more suite names. All suites are resolved before
    # the first case starts, so an unknown name fails immediately.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("suite", type=str, nargs="+")
    args = arg_parser.parse_args()

    selected_cases = [
        bench_case
        for suite_name in args.suite
        for bench_case in suites[suite_name]
    ]

    for bench_case in selected_cases:
        started = time.time()
        bench_one_case(bench_case)
        elapsed = time.time() - started
        print(f"elapsed: {elapsed:.2f} s")
        # Short pause between consecutive cases.
        time.sleep(2)


================================================
FILE: benchmark/hf_ds/hf_opt.py
================================================
"""
Run OPT with huggingface or deepspeed.

Usage:
deepspeed --num_gpus 1 hf_opt.py --model facebook/opt-1.3b --batch-size 16 --cpu

Reference:
https://github.com/huggingface/transformers-bloom-inference/tree/main/bloom-inference-scripts
"""

import argparse
import multiprocessing as mp
import os
import pickle
import time

import numpy as np

from accelerate import (infer_auto_device_map, init_empty_weights,
    load_checkpoint_and_dispatch)
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from transformers import OPTForCausalLM
import torch

from flexllmgen.timer import timers
from flexllmgen.utils import (GB, project_decode_latency,
    write_benchmark_log)
from flexllmgen.opt_config import (get_opt_config,
    disable_torch_init, disable_hf_opt_init)


def get_filename(model_name, batch_size, prompt_len, gen_len,
                 cpu_offload, disk_offload, num_nodes, num_gpus_per_node,
                 use_deepspeed):
    """Return a descriptive file-name stem for one benchmark configuration.

    The stem has the form
    ``<ds|hf>-<size>-bs<B>-prompt<P>-gen<G>-n<N>x<G>-<cpu|disk|gpu>``, where
    ``<size>`` is whatever follows the last ``-`` in ``model_name``. When both
    offload flags are set, ``cpu`` wins.
    """
    backend = "ds" if use_deepspeed else "hf"
    model_size = model_name.rpartition('-')[2]

    if cpu_offload:
        placement = "cpu"
    elif disk_offload:
        placement = "disk"
    else:
        placement = "gpu"

    pieces = (
        f"{backend}-",
        f"{model_size}-bs{batch_size}-prompt{prompt_len}-gen{gen_len}-",
        f"n{num_nodes}x{num_gpus_per_node}-",
        placement,
    )
    return "".join(pieces)


def meta_to_cpu(container, dtype=None):
    """Recursively replace tensors with freshly allocated, uninitialized ones.

    Every tensor found in ``container`` is replaced by ``torch.empty`` of the
    same shape. No device is passed, so the new tensors live on torch's
    default device rather than on the source tensor's device.

    Args:
        container: A ``torch.Tensor``, or a (possibly nested) ``tuple`` /
            ``dict`` of tensors.
        dtype: dtype for the new tensors; each tensor keeps its own dtype when
            this is ``None``.

    Returns:
        A tensor, tuple or plain ``dict`` mirroring the structure of
        ``container``.

    Raises:
        ValueError: If a value is neither a tensor, a tuple nor a dict.
    """
    if isinstance(container, torch.Tensor):
        # Pass the shape as a single sequence: unpacking it (`*shape`) leaves
        # torch.empty without a size argument for 0-dim tensors.
        return torch.empty(container.shape, dtype=dtype or container.dtype)
    if isinstance(container, tuple):
        return tuple(meta_to_cpu(x, dtype) for x in container)
    if isinstance(container, dict):
        return {k: meta_to_cpu(v, dtype) for k, v in container.items()}
    raise ValueError(f"Invalid type: {container}")


def realize_meta_module(module, dtype=None, device=None):
    """Replace every parameter of ``module`` (recursively) with a new one.

    Child modules are handled first. Each non-``None`` parameter is then
    swapped for a ``torch.nn.Parameter`` wrapping an uninitialized tensor of
    the same shape; ``dtype`` and ``device`` override the parameter's own
    values when given. Any registered buffer must be ``None``, otherwise the
    assertion fails.
    """
    for child in module.children():
        realize_meta_module(child, dtype, device)

    # Snapshot the keys, since entries are reassigned while iterating.
    for key in list(module._parameters):
        param = module._parameters[key]
        if param is None:
            continue
        storage = torch.empty(
            *param.shape,
            dtype=dtype or param.dtype,
            device=device or param.device,
        )
        module._parameters[key] = torch.nn.Parameter(storage)

    for key in list(module._buffers):
        buf = module._buffers[key]
        assert buf is None


def get_model_config(model_name):
    """Return the HuggingFace config for ``model_name``.

    Names containing ``"175b"`` are not looked up directly: the
    ``facebook/opt-66b`` config is loaded and its size fields are overridden.
    NOTE(review): presumably because no 175B config can be downloaded --
    confirm.
    """
    if "175b" not in model_name:
        return AutoConfig.from_pretrained(model_name)

    hidden = 12288
    config = AutoConfig.from_pretrained("facebook/opt-66b")
    config.hidden_size = hidden
    config.word_embed_proj_dim = hidden
    config.ffn_dim = hidden * 4
    config.num_attention_heads = 96
    config.num_hidden_layers = 96
    return config


def get_ds_opt_model(model_name, dtype, cpu_offload, disk_offload, offload_dir,
                     dummy_weights):
    """Load an OPT model and wrap it in a DeepSpeed ZeRO stage-3 engine.

    NOTE: besides its parameters, this function reads the module-level
    ``args`` object (``args.pin_memory`` and ``args.batch_size``), so it only
    works after the command line has been parsed into that global.

    Args:
        model_name: Model name; used for the config lookup and, when
            ``dummy_weights`` is falsy, as the checkpoint to load.
        dtype: ``torch.float16`` or ``torch.bfloat16`` enables the matching
            section of the DeepSpeed config; also passed as ``torch_dtype``.
        cpu_offload: Offload parameters to CPU memory.
        disk_offload: Offload parameters to NVMe under ``offload_dir``. When
            both flags are set, this overwrites the CPU offload settings.
        offload_dir: Directory used as ``nvme_path`` for disk offloading.
        dummy_weights: Path of a checkpoint to load instead of ``model_name``;
            falsy to load ``model_name``.

    Returns:
        The module held by the DeepSpeed engine, in eval mode.
    """
    import deepspeed
    import torch.distributed as dist
    from transformers.deepspeed import HfDeepSpeedConfig

    config = get_model_config(model_name)
    hidden_size = config.hidden_size
    deepspeed.init_distributed("nccl")
    # `rank` is not used below.
    rank = dist.get_rank()
    pin_memory = bool(args.pin_memory)

    ds_config = {
        "fp16": {
            "enabled": dtype == torch.float16,
        },
        "bf16": {
            "enabled": dtype == torch.bfloat16,
        },
        "zero_optimization": {
            "stage": 3,
            "stage3_prefetch_bucket_size": hidden_size * hidden_size,
            "stage3_param_persistence_threshold": 0,
        },
        "steps_per_print": 2000,
        "train_batch_size": args.batch_size,
        "wall_clock_breakdown": False,
    }

    if cpu_offload:
        ds_config["zero_optimization"]["offload_param"] = dict(
            device="cpu", pin_memory=pin_memory)

    # Assigned after the CPU settings, so disk offload takes precedence when
    # both flags are set.
    if disk_offload:
        ds_config["zero_optimization"]["offload_param"] = dict(
            device="nvme",
            pin_memory=True,
            nvme_path=offload_dir,
            buffer_count=5,
            buffer_size=2 * GB,
        )
        ds_config["aio"] = {
          "block_size": 1048576,
          "queue_depth": 8,
          "thread_count": 1,
          "single_submit": False,
          "overlap_events": True,
        }

    # `dschf` is never read again in this function.
    # NOTE(review): presumably the object has to exist before from_pretrained()
    # so that transformers loads the weights ZeRO-3 aware -- confirm against
    # the transformers DeepSpeed integration docs before removing it.
    dschf = HfDeepSpeedConfig(ds_config)

    model = OPTForCausalLM.from_pretrained(
        dummy_weights or model_name, torch_dtype=dtype)
    model = model.eval()
    # deepspeed.initialize returns a tuple; only its first element (the
    # engine) is kept.
    ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
    ds_engine.module.eval()
    model = ds_engine.module

    return model


def get_hf_opt_model(model_name, dtype, cpu_offload, disk_offload, offload_dir,
                     num_gpus, dummy_weights):
    """Load an OPT model through HuggingFace with the requested offloading.

    Args:
        model_name: Checkpoint to load when ``dummy_weights`` is falsy.
        dtype: ``torch.int8`` selects 8-bit loading; any other dtype is passed
            as ``torch_dtype``.
        cpu_offload: Keep the decoder layers in CPU memory.
        disk_offload: Keep the decoder layers on disk (only consulted when
            ``cpu_offload`` is falsy).
        offload_dir: Folder passed as ``offload_folder``.
        num_gpus: Number of GPUs available to the process.
        dummy_weights: Path of a checkpoint to load instead of ``model_name``;
            falsy to load ``model_name``.

    Returns:
        The loaded model in eval mode.
    """
    if num_gpus == 1 and dtype != torch.int8:
        # Use a hand-written device_map instead of device_map == "auto",
        # because we want to offload as many weights as possible from the GPU
        # to allow a larger batch size.
        max_memory = None
        if cpu_offload:
            offload_target = "cpu"
        elif disk_offload:
            offload_target = "disk"
        else:
            offload_target = None

        if offload_target is None:
            device_map = None
        else:
            # NOTE: Some weights must stay on the GPU. Otherwise, huggingface
            # reports errors.
            device_map = {
                "model.decoder.embed_tokens.weight": 0,
                "model.decoder.embed_positions.weight": 0,
                "model.decoder.final_layer_norm": offload_target,
                "model.decoder.layers": offload_target,
                "lm_head.weight": 0,
            }
    else:
        # Use device_map == "auto" with a low `max_memory` threshold, because
        # we want to offload as many weights as possible from the GPU to
        # allow a larger batch size.
        device_map = "auto"
        # `max_memory` should be larger than the embedding. 2GB is used when
        # offloading because the embedding of opt-175b is 1.2GB.
        per_gpu_budget = "2GB" if (cpu_offload or disk_offload) else "14GB"
        max_memory = dict.fromkeys(range(num_gpus), per_gpu_budget)
        max_memory["cpu"] = "160GB"

    load_kwargs = ({"load_in_8bit": True} if dtype == torch.int8
                   else {"torch_dtype": dtype})

    disable_torch_init()
    model = OPTForCausalLM.from_pretrained(
        dummy_weights or model_name,
        device_map=device_map,
        max_memory=max_memory,
        offload_folder=offload_dir,
        **load_kwargs)
    # Without a device map nothing was dispatched, so move the model to the
    # GPU explicitly.
    if device_map is None:
        model.cuda()

    model.eval()
    return model


def run_generation(model_name, batch_size, prompt_len, gen_len, cut_gen_len,
                   cpu_offload, disk_offload, offload_dir, use_int8,
                   num_nodes, num_gpus_per_node, use_deepspeed, dummy,
                   output_file, pkl_file, no_log, verbose):
    """Run one OPT generation benchmark and write its latency/throughput log.

    The function loads the tokenizer and model, runs a one-token warmup,
    runs the timed generation, and then derives prefill/decode/total
    latency and throughput from the "generate-forward" timer.

    Args:
        model_name: Model identifier used for the tokenizer, the model
            loaders, the OPT config lookup and the automatic log file name.
        batch_size: Number of prompts (divided by ``WORLD_SIZE`` per process
            when ``use_deepspeed`` is set).
        prompt_len: Prompts are padded to exactly this many tokens.
        gen_len: Number of tokens the reported numbers refer to.
        cut_gen_len: If truthy, only this many tokens are generated and the
            decode latency is projected to ``gen_len``.
        cpu_offload, disk_offload, offload_dir: Forwarded to the model loader.
        use_int8: Load the model with ``torch.int8`` instead of float16.
        num_nodes, num_gpus_per_node: Used for model placement and for the
            automatic log file name.
        use_deepspeed: Select the DeepSpeed loader instead of the plain
            HuggingFace one.
        dummy: Create (if missing) and load dummy weights stored under
            ``offload_dir`` instead of the real checkpoint.
        output_file: Log file path, or ``"auto"`` to derive one with
            ``get_filename``.
        pkl_file, no_log: Accepted for interface compatibility; not used by
            this function.
        verbose: ``>= 1`` prints the log string; ``>= 2`` also prints the
            first and last decoded outputs and asserts the sequence lengths.

    Returns:
        None. Under DeepSpeed, processes whose ``args.local_rank`` is not 0
        return before any logging happens.

    Note:
        Besides its parameters, this function still reads the module-level
        names ``args.local_rank``, ``WORLD_SIZE`` and ``timers``.
    """
    # Load tokenizer
    # NOTE(review): the 175b name is mapped to 66b for the tokenizer only --
    # presumably because no 175b tokenizer is published; confirm.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name.replace("175b", "66b"), padding_side="left")

    # Load model
    if use_int8:
        dtype = torch.int8
    else:
        dtype = torch.float16

    if dummy:
        config = get_model_config(model_name)
        filename = os.path.join(offload_dir,
            f"{model_name.replace('/', '-')}-hf-weights/")
        if not os.path.exists(filename):
            print("create dummy weights")
            with init_empty_weights():
                model = OPTForCausalLM(config)
            model.save_pretrained(filename,
                state_dict=meta_to_cpu(model.state_dict(), torch.float16))
        dummy_weights = filename
    else:
        dummy_weights = None

    print("load model")
    if use_deepspeed:
        model = get_ds_opt_model(model_name, dtype, cpu_offload, disk_offload,
            offload_dir, dummy_weights)
    else:
        model = get_hf_opt_model(model_name, dtype, cpu_offload, disk_offload,
            offload_dir, num_gpus_per_node, dummy_weights)

    # Run generation
    execute_gen_len = cut_gen_len if cut_gen_len else gen_len
    if use_deepspeed:
        prompts = ["Paris is the capital city of"] * (batch_size // WORLD_SIZE)
    else:
        prompts = ["Paris is the capital city of"] * batch_size
    input_ids = tokenizer(prompts, return_tensors="pt",
                          padding="max_length",
                          max_length=prompt_len).input_ids.cuda()

    # Warmup
    print("wamup")
    generate_kwargs_warmup = dict(max_new_tokens=1, do_sample=False)
    with torch.no_grad():
        output_ids = model.generate(input_ids=input_ids, **generate_kwargs_warmup)

    # Run
    print("benchmark")
    # Drop the warmup measurement so `costs` only covers the timed run.
    timers("generate-forward").reset()
    generate_kwargs = dict(max_new_tokens=execute_gen_len, do_sample=False)
    with torch.no_grad():
        output_ids = model.generate(input_ids=input_ids, **generate_kwargs)
    costs = timers("generate-forward").costs

    # Only local rank 0 reports results when running under DeepSpeed.
    if use_deepspeed and args.local_rank != 0:
        return

    # Log output
    prefill_latency = costs[0]
    prefill_throughput = batch_size * prompt_len / prefill_latency
    if cut_gen_len:  # project latency of cut_gen_len to gen_len
        decode_latency = project_decode_latency(costs, prompt_len, gen_len)
    else:
        decode_latency = sum(costs[1:])
    # Guard against a zero decode latency (e.g. gen_len == 1).
    decode_throughput = batch_size * (gen_len - 1) / max(decode_latency, 1e-10)
    num_generated_tokens = batch_size * gen_len
    total_latency = prefill_latency + decode_latency
    total_throughput = num_generated_tokens / total_latency
    gpu_peak_mem = torch.cuda.max_memory_allocated(torch.device("cuda"))

    if verbose >= 2:
        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        show_str = "Outputs:\n" + 70 * '-' + "\n"
        for i in [0, len(outputs)-1]:
            show_str += f"{i}: {outputs[i]}\n"
            show_str += 70 * '-' + "\n"
        print(show_str)

        # Check lengths
        input_lens = [len(x) for x in input_ids]
        output_lens = [len(x) for x in output_ids]
        assert all(x == prompt_len for x in input_lens)
        assert all(x == prompt_len + execute_gen_len for x in output_lens)

    # Use the `output_file` parameter rather than the global `args.log_file`
    # so the function does not depend on how the script was invoked.
    if output_file == "auto":
        filename = get_filename(model_name, batch_size, prompt_len,
            gen_len, cpu_offload, disk_offload, num_nodes,
            num_gpus_per_node, use_deepspeed) + ".log"
    else:
        filename = output_file

    projected = bool(cut_gen_len)
    opt_config = get_opt_config(model_name)
    cache_size = opt_config.cache_bytes(batch_size, prompt_len + gen_len)
    hidden_size = opt_config.hidden_bytes(batch_size, prompt_len + gen_len)
    log_str = write_benchmark_log(filename,
        opt_config.model_bytes(), cache_size, hidden_size,
        gpu_peak_mem, projected, prefill_latency, prefill_throughput,
        decode_latency, decode_throughput, total_latency, total_throughput)
    if verbose >= 1:
        print(log_str)


if __name__ == "__main__":
    # Command-line entry point: parse the benchmark options, detect whether
    # the script was started by the DeepSpeed launcher (which passes
    # --local_rank) and run a single benchmark.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="facebook/opt-1.3b")
    parser.add_argument("--dummy", action="store_true",
        help="Use dummy weights for benchmark purposes.")
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--prompt-len", type=int, default=512)
    parser.add_argument("--gen-len", type=int, default=32)
    parser.add_argument("--cut-gen-len", type=int)
    parser.add_argument("--local_rank", type=int)
    parser.add_argument("--num-gpus", type=int, default=1)
    parser.add_argument("--pin-memory", type=int, default=1)
    parser.add_argument("--cpu-offload", action="store_true")
    parser.add_argument("--disk-offload", action="store_true")
    parser.add_argument("--offload-dir", type=str, default="~/flexllmgen_offload_dir")
    parser.add_argument("--int8", action="store_true")

    parser.add_argument("--log-file", type=str, default="auto")
    parser.add_argument("--pkl-file", type=str, default="auto")
    parser.add_argument("--no-log", action="store_true")
    parser.add_argument("--verbose", type=int, default=2)
    args = parser.parse_args()

    # --no-log contradicts an explicitly chosen output file. The option is
    # named --log-file, so the parsed attribute is `args.log_file`; the
    # previous check read a non-existent `args.output_file` and crashed with
    # AttributeError whenever --no-log was given.
    if args.no_log and (args.log_file != "auto" or args.pkl_file != "auto"):
        parser.error("--no-log cannot be combined with --log-file or --pkl-file")

    if args.local_rank is None:  # huggingface
        use_deepspeed = False
        num_gpus_per_node = args.num_gpus
        num_nodes = 1
    else:  # deepspeed
        use_deepspeed = True
        # WORLD_SIZE is also read as a module-level global by run_generation.
        WORLD_SIZE = int(os.getenv("WORLD_SIZE"))
        num_gpus_per_node = torch.cuda.device_count()
        num_nodes = WORLD_SIZE // num_gpus_per_node

    run_generation(args.model, args.batch_size, args.prompt_len, args.gen_len,
                   args.cut_gen_len, args.cpu_offload, args.disk_offload,
                   os.path.abspath(os.path.expanduser(args.offload_dir)),
                   args.int8, num_nodes, num_gpus_per_node, use_deepspeed,
                   args.dummy, args.log_file, args.pkl_file,
                   args.no_log, args.verbose)


================================================
FILE: benchmark/hf_ds/hostfile
================================================
172.31.19.249 slots=1
172.31.29.45  slots=1


================================================
FILE: benchmark/petals/README.md
================================================
# Running Petals benchmarks

This guide contains the steps necessary to reproduce experiments in Section 6.3 and Table 17 of the paper.

## Requirements

To start the benchmarks, you will need a Linux-based environment with an installed [version of Petals that supports OPT-sized models](https://github.com/bigscience-workshop/petals/tree/test_opt_serving).
We provide a Docker image with such an environment at [mrbn/petals:test_opt_serving](https://hub.docker.com/layers/mrbn/petals/test_opt_serving/images/sha256-5c38f459f1b42fc655f85523b78c22a1b3c05139d9bd03cd5e2a395e8d73b7aa?context=explore) on DockerHub.
You will also need to install the [Traffic Control](https://wiki.debian.org/TrafficControl) (`tc`) utility for controlling the network connection speed.

## Setting up a private swarm

First, you need to set up a coordinator peer using [this part](https://github.com/bigscience-workshop/petals/wiki/Launch-your-own-swarm#step-1-set-up-the-network) of the Petals private swarm guide.
If you do not need a persistent identifier, a single command might suffice:

```
hivemind-dht
```
After the DHT node is started, it will give you an address of this peer. 
You will need it to connect all other servers and clients to each other; the following parts of this guide assume it is stored in the `$INITIAL_PEER_ID` environment variable.

Then, on each of the machines with GPUs, you need to run two commands that set the network throughput and latency and launch the actual server. 
We assume the OPT-30B model in these commands (Section 6.3); replace the model names with `facebook/opt-6.7b` or `facebook/opt-175b` if necessary.

The commands for each setup are below. 
If you are not using Docker, simply remove everything before `python -m petals.cli.run_server`.
Also, change `$NETWORK_INTERFACE` in the first command to the external [network interface](https://www.cyberciti.biz/faq/linux-list-network-interfaces-names-command/) used by your GPU nodes.

* 10ms latency, 1 Gbit throughput
    ```
    tc qdisc add dev $NETWORK_INTERFACE root netem delay 10ms rate 1Gbit limit 225000
    sudo docker run --net host --ipc host --gpus all --name petals --volume petals-cache:/cache \
      --rm mrbn/petals:test_opt_serving python -m petals.cli.run_server facebook/opt-30b --initial_peers $INITIAL_PEER_ID
    ```
* 10ms latency, 100 Mbit throughput
    ```
    tc qdisc add dev $NETWORK_INTERFACE root netem delay 10ms rate 0.1Gbit limit 22500
    sudo docker run --net host --ipc host --gpus all --name petals --volume petals-cache:/cache \
      --rm mrbn/petals:test_opt_serving python -m petals.cli.run_server facebook/opt-30b --initial_peers $INITIAL_PEER_ID
    ```
* 100ms latency, 100 Mbit throughput
    ```
    tc qdisc add dev $NETWORK_INTERFACE root netem delay 100ms rate 0.1Gbit limit 2250000
    sudo docker run --net host --ipc host --gpus all --name petals --volume petals-cache:/cache \
      --rm mrbn/petals:test_opt_serving python -m petals.cli.run_server facebook/opt-30b --initial_peers $INITIAL_PEER_ID
    ```

## Running benchmarks

Finally, on another GPU-enabled machine (preferably with at least 2 GPUs) with the same environment, run the following command:

```
python run_opt_requests.py --initial_peers $INITIAL_PEER_ID --prefix facebook/opt-30b \
  -b 1 --num-micro-batches 2 --num-processes 6 --output out_30b.tsv
```

This script will produce a TSV file with the following columns:
* Microbatch size
* Number of microbatches
* Number of processes
* Prefix length
* Output sequence length
* Total throughput (**needs to be divided by the number of GPU peers in the swarm**)
* Average latency per token

Using this data, you can reproduce the results from our paper by either displaying results for several batch sizes in a table (Table 17) or plotting the throughput/latency trends with respect to the number of generated tokens.

================================================
FILE: benchmark/petals/run_opt_requests.py
================================================
import time
from argparse import ArgumentParser
from statistics import mean

import torch
from petals import DistributedBloomConfig, DistributedBloomForCausalLM
from torch.multiprocessing import Process, Event, Queue
from transformers import AutoTokenizer, BloomConfig, OPTConfig


def _patch_bloom_config(bloom_config: BloomConfig, opt_config: OPTConfig):
    """Copy the OPT architecture dimensions onto ``bloom_config`` in place.

    Hidden size, number of attention heads, number of layers and vocabulary
    size are read from ``opt_config`` and stored under the corresponding
    Bloom attribute names. Nothing is returned.
    """
    # (Bloom attribute, OPT attribute) pairs, applied in this order.
    attribute_pairs = (
        ("hidden_size", "hidden_size"),
        ("n_head", "num_attention_heads"),
        ("n_layer", "num_hidden_layers"),
        ("vocab_size", "vocab_size"),
    )
    for bloom_attr, opt_attr in attribute_pairs:
        setattr(bloom_config, bloom_attr, getattr(opt_config, opt_attr))


def client_process(
    finished_warmup,
    can_start,
    config_bloom,
    num_micro_batches,
    batch_size,
    sequence_length,
    max_tokens,
    process_index,
    queue: Queue,
) -> None:
    """Run one benchmark client and report per-micro-batch latencies.

    The client builds a random prompt batch of shape
    ``(batch_size, sequence_length)``, does a one-token warmup generation,
    sets ``finished_warmup`` and then blocks on ``can_start``. After that it
    runs ``num_micro_batches`` generations of ``max_tokens`` new tokens and
    puts the wall-clock duration of each one on ``queue``.
    """
    torch.set_num_threads(1)
    # Spread client processes over the visible GPUs round-robin.
    gpu_index = process_index % torch.cuda.device_count()
    torch.cuda.set_device(gpu_index)

    # The tokenizer is only needed for its vocabulary size.
    vocab_size = AutoTokenizer.from_pretrained("facebook/opt-30b").vocab_size
    prompt_ids = torch.randint(0, vocab_size, size=(batch_size, sequence_length), device="cuda")

    client_model = DistributedBloomForCausalLM(config_bloom)
    client_model.cuda()

    # warmup
    client_model.generate(prompt_ids, max_new_tokens=1, do_sample=False)
    finished_warmup.set()
    can_start.wait()

    for _ in range(num_micro_batches):
        began = time.monotonic()
        client_model.generate(prompt_ids, max_new_tokens=max_tokens, do_sample=False)
        finished = time.monotonic()
        queue.put(finished - began)


def main():
    """Parse the CLI options, build the swarm config and run the benchmark sweeps.

    A Bloom config is loaded from "bigscience/bloom-petals" and patched with
    the OPT dimensions: hard-coded values for "facebook/opt-175b", otherwise
    the values of the OPT config loaded for ``--prefix``. The benchmark then
    runs for prompt lengths 256, 512 and 1024 with 32 new tokens, and, for
    "facebook/opt-30b" only, for every ``max_tokens`` in ``range(33)`` at
    prompt length 512.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--initial_peers",
        nargs="*",
        help="Multiaddrs of the peers that will welcome you into the existing DHT. "
             "Example: /ip4/203.0.113.1/tcp/31337/p2p/XXXX /ip4/203.0.113.2/tcp/7777/p2p/YYYY",
    )
    parser.add_argument('--prefix', default="facebook/opt-175b", help="Prefix of the model.")
    # The three integer options share the same default and type.
    for option_flags in (('--batch-size', "-b"), ('--num-micro-batches',), ('--num-processes',)):
        parser.add_argument(*option_flags, default=1, type=int)
    parser.add_argument('--output', required=True)
    args = parser.parse_args()

    model_prefix = args.prefix

    config_bloom = DistributedBloomConfig.from_pretrained("bigscience/bloom-petals")
    config_bloom.initial_peers = args.initial_peers
    # The 6.7b model uses the short DHT prefix "opt6b"; every other model
    # uses its own name.
    config_bloom.dht_prefix = "opt6b" if model_prefix == "facebook/opt-6.7b" else model_prefix

    if model_prefix != "facebook/opt-175b":
        _patch_bloom_config(config_bloom, OPTConfig.from_pretrained(model_prefix))
    else:
        # The 175b dimensions are set directly instead of being loaded.
        config_bloom.hidden_size = 12288
        config_bloom.n_layer = 96
        config_bloom.n_head = 96
        config_bloom.vocab_size = 50272

    for prompt_tokens in (256, 512, 1024):
        run_bench(args=args, sequence_length=prompt_tokens, max_tokens=32, config_bloom=config_bloom)

    if model_prefix == "facebook/opt-30b":
        for new_tokens in range(33):
            run_bench(args=args, sequence_length=512, max_tokens=new_tokens, config_bloom=config_bloom)


def run_bench(args, sequence_length, max_tokens, config_bloom):
    """Run one benchmark configuration and append its result row to ``args.output``.

    ``args.num_processes`` client processes are started. Once every client
    has signalled the end of its warmup they are all released together, and
    the time until the last one exits is measured. Throughput is the total
    number of generated tokens divided by that time; latency is the mean of
    the per-micro-batch durations collected from the clients. The row is
    written tab-separated: batch size, micro-batches, processes, sequence
    length, max tokens, throughput, latency.
    """
    result_queue = Queue()
    start_signal = Event()
    # Each entry pairs a client process with its own "warmup done" event.
    clients = []
    for rank in range(args.num_processes):
        print("create process", rank)
        warmup_done = Event()
        client_args = (
            warmup_done, start_signal, config_bloom, args.num_micro_batches, args.batch_size,
            sequence_length, max_tokens, rank, result_queue,
        )
        client = Process(target=client_process, args=client_args)
        client.start()
        clients.append((client, warmup_done))

    # Release the clients only after all of them finished warming up.
    for _, warmup_done in clients:
        warmup_done.wait()
    start_signal.set()

    began = time.monotonic()
    for rank, (client, _) in enumerate(clients):
        print("join process", rank)
        client.join()
    elapsed = time.monotonic() - began

    latencies = []
    while not result_queue.empty():
        latencies.append(result_queue.get())

    print("total time", elapsed)
    total_tokens = args.batch_size * args.num_micro_batches * args.num_processes * max_tokens
    print("total tokens", total_tokens)

    throughput = total_tokens / elapsed
    print("throughput", throughput)
    latency = mean(latencies)
    print("average latency", latency)

    row = [args.batch_size, args.num_micro_batches, args.num_processes, sequence_length, max_tokens,
           throughput, latency]
    with open(args.output, "a") as f:
        f.write("\t".join(str(value) for value in row) + "\n")


if __name__ == "__main__":
    main()


================================================
FILE: benchmark/third_party/DeepSpeed/.clang-format
================================================
---
# Refer to the following link for an explanation of each parameter:
#   http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
# This is deprecated
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments:  false
BinPackParameters: false
BraceWrapping:
  AfterClass:            false
  AfterControlStatement: false
  AfterEnum:             false
  AfterFunction:         false
  AfterNamespace:        false
  AfterObjCDeclaration:  false
  AfterStruct:           false
  AfterUnion:            false
  AfterExternBlock:      false
  BeforeCatch:           false
  BeforeElse:            false
  IndentBraces:          false
  # disabling the below splits, else, they'll just add to the vertical length of source files!
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: WebKit
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 100
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
# Kept the below 2 to be the same as `IndentWidth` to keep everything uniform
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
  - foreach
  - Q_FOREACH
  - BOOST_FOREACH
IncludeBlocks: Preserve
IncludeCategories:
  - Regex:           '^<ext/.*\.h>'
    Priority:        2
  - Regex:           '^<.*\.h>'
    Priority:        1
  - Regex:           '^<.*'
    Priority:        2
  - Regex:           '.*'
    Priority:        3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth:     4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd:   ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 4
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
RawStringFormats:
  - Language: Cpp
    Delimiters:
      - cc
      - CC
      - cpp
      - Cpp
      - CPP
      - 'c++'
      - 'C++'
    CanonicalDelimiter: ''
  - Language: TextProto
    Delimiters:
      - pb
      - PB
      - proto
      - PROTO
    EnclosingFunctions:
      - EqualsProto
      - EquivToProto
      - PARSE_PARTIAL_TEXT_PROTO
      - PARSE_TEST_PROTO
      - PARSE_TEXT_PROTO
      - ParseTextOrDie
      - ParseTextProtoOrDie
    CanonicalDelimiter: ''
    BasedOnStyle: google
# Enabling comment reflow causes doxygen comments to be messed up in their formats!
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
StatementMacros:
  - Q_UNUSED
  - QT_REQUIRE_VERSION
# Be consistent with indent-width, even for people who use tab for indentation!
TabWidth: 4
UseTab: Never


================================================
FILE: benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/compression_bug_report.md
================================================
---
name: Bug report (compression)
about: Create a DeepSpeed compression related issue to help us improve
title: "[BUG]"
labels: bug,compression
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**ds_report output**
Please run `ds_report` to give us details about your setup.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**System info (please complete the following information):**
 - OS: [e.g. Ubuntu 18.04]
 - GPU count and types [e.g. two machines with x8 A100s each]
 - Interconnects (if applicable) [e.g., two machines connected with 100 Gbps IB]
 - Python version
 - Any other relevant info about your setup

**Launcher context**
Are you launching your experiment with the `deepspeed` launcher, MPI, or something else?

**Docker context**
Are you using a specific docker image that you can share?

**Additional context**
Add any other context about the problem here.


================================================
FILE: benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: "[REQUEST]"
labels: enhancement
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.


================================================
FILE: benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/inference_bug_report.md
================================================
---
name: Bug report (inference)
about: Create a DeepSpeed inference related issue to help us improve
title: "[BUG]"
labels: bug,inference
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Simple inference script to reproduce
2. What packages are required and their versions
3. How to run the script
4. ...

**Expected behavior**
A clear and concise description of what you expected to happen.

**ds_report output**
Please run `ds_report` to give us details about your setup.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**System info (please complete the following information):**
 - OS: [e.g. Ubuntu 18.04]
 - GPU count and types [e.g. two machines with x8 A100s each]
 - (if applicable) what [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) version are you using
 - (if applicable) Hugging Face Transformers/Accelerate/etc. versions
 - Python version
 - Any other relevant info about your setup

**Docker context**
Are you using a specific docker image that you can share?

**Additional context**
Add any other context about the problem here.


================================================
FILE: benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/training_bug_report.md
================================================
---
name: Bug report (training)
about: Create a DeepSpeed training related issue to help us improve
title: "[BUG]"
labels: bug,training
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**ds_report output**
Please run `ds_report` to give us details about your setup.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**System info (please complete the following information):**
 - OS: [e.g. Ubuntu 18.04]
 - GPU count and types [e.g. two machines with x8 A100s each]
 - Interconnects (if applicable) [e.g., two machines connected with 100 Gbps IB]
 - Python version
 - Any other relevant info about your setup

**Launcher context**
Are you launching your experiment with the `deepspeed` launcher, MPI, or something else?

**Docker context**
Are you using a specific docker image that you can share?

**Additional context**
Add any other context about the problem here.


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/amd.yml
================================================
name: amd

on:
  push:
    branches:
      - 'master'
      - 'staging**'
    paths-ignore:
      - 'docs/**'
  pull_request:
    paths-ignore:
      - 'docs/**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    # The type of runner that the job will run on
    runs-on: [self-hosted, amd]

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v2

      # Runs a single command using the runners shell
      - name: environment
        run: |
          echo "JobID: $AISC_NODE_INSTANCE_ID"
          rocm-smi --showhw
          which python
          python --version
          which hipcc
          hipcc --version
          pip install --upgrade pip
          pip uninstall --yes torch torchvision triton
          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
          sudo apt-get update
          sudo apt-get install -y libaio-dev

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          # git checkout 1cc453d33
          git rev-parse --short HEAD
          pip install .

      # Runs a set of commands using the runners shell
      - name: Install deepspeed
        run: |
          pip install .[dev,1bit,autotuning]
          #python -c "from deepspeed.env_report import cli_main; cli_main()"
          ds_report

      - name: Python environment
        run: |
          pip list

      # Runs a set of commands using the runners shell
      - name: Unit tests
        run: |
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          cd tests
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked -n 4 --verbose unit/
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/formatting.yml
================================================
name: Formatting

on:
  push:
    branches:
      - 'master'
      - 'staging**'
  pull_request:
    branches:
      '**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:

  # formatting and basic install on cpu-only machine
  formatting:
    runs-on: ubuntu-20.04

    steps:
      - uses: actions/checkout@v2

      - name: environment
        run: |
          which python
          python --version

      - name: Install deepspeed
        run: |
          pip install .[dev,autotuning]
          ds_report

      - name: Formatting checks
        run: |
           pre-commit run --all-files


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/nv-accelerate-v100.yml
================================================
name: nv-accelerate-v100

on:
  push:
    branches:
      - 'master'
      - 'staging**'
    paths-ignore:
      - 'docs/**'
  pull_request:
    paths-ignore:
      - 'docs/**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu111, v100]

    steps:
      - uses: actions/checkout@v2

      - name: environment
        run: |
          echo "JobID: $AISC_NODE_INSTANCE_ID"
          nvidia-smi
          which python
          python --version
          which nvcc
          nvcc --version
          pip install --upgrade pip
          pip uninstall --yes torch torchvision triton
          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install deepspeed
        run: |
          pip uninstall --yes deepspeed
          pip install .[dev,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      - name: HF Accelerate tests
        run: |
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          git clone https://github.com/huggingface/accelerate
          cd accelerate
          # tmp fix
          git checkout 5f4ba04628eeea14f9d248ab0e54399899503532
          git rev-parse --short HEAD
          # installing dependencies
          pip install .[testing]
          # force protobuf version due to issues
          pip install "protobuf<4.21.0"
          # tmp fix: force newer datasets version
          #pip install "datasets>=2.0.0"
          pip list
          HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose tests/deepspeed


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/nv-inference.yml
================================================
name: nv-inference

on:
  push:
    branches:
      - 'master'
      - 'staging**'
    paths-ignore:
      - 'docs/**'
  pull_request:
    paths-ignore:
      - 'docs/**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu116, v100]

    steps:
      - uses: actions/checkout@v2

      - name: environment
        run: |
          echo "JobID: $AISC_NODE_INSTANCE_ID"
          nvidia-smi
          which python
          python --version
          which nvcc
          nvcc --version
          pip install --upgrade pip
          pip uninstall --yes torch torchvision triton
          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          pip uninstall --yes transformers
          pip install .

      - name: Install deepspeed
        run: |
          pip uninstall --yes deepspeed
          pip install .[dev,1bit,autotuning,inf]
          ds_report

      - name: Python environment
        run: |
          pip list

      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          cd tests
          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked -n 4 --verbose -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/nv-lightning-v100.yml
================================================
name: nv-lightning-v100

on:
  push:
    branches:
      - 'master'
      - 'staging**'
    paths-ignore:
      - 'docs/**'
  pull_request:
    paths-ignore:
      - 'docs/**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu111, v100]

    steps:
      - uses: actions/checkout@v2

      - name: environment
        run: |
          echo "JobID: $AISC_NODE_INSTANCE_ID"
          nvidia-smi
          which python
          python --version
          which nvcc
          nvcc --version
          pip install --upgrade pip
          pip uninstall --yes torch torchvision
          pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install deepspeed
        run: |
          pip uninstall --yes deepspeed
          pip install .[dev,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      - name: PyTorch Lightning Tests
        run: |
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          pip uninstall --yes pytorch-lightning
          pip install pytorch-lightning
          pip install "protobuf<4.21.0"
          cd tests
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose lightning/


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/nv-mii.yml
================================================
name: nv-mii

on:
  push:
    branches:
      - 'master'
      - 'staging**'
    paths-ignore:
      - 'docs/**'
  pull_request:
    paths-ignore:
      - 'docs/**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu116, v100]

    steps:
      - uses: actions/checkout@v2

      - name: environment
        run: |
          echo "JobID: $AISC_NODE_INSTANCE_ID"
          nvidia-smi
          which python
          python --version
          which nvcc
          nvcc --version
          pip install --upgrade pip
          pip uninstall --yes torch torchvision triton
          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install MII
        run: |
          pip uninstall --yes deepspeed deepspeed-mii transformers
          pip install .[dev]
          pip install git+https://github.com/huggingface/transformers.git

      - name: Python environment
        run: |
          pip list

      - name: Unit tests
        run: |
          git clone https://github.com/microsoft/DeepSpeed-MII.git
          cd DeepSpeed-MII
          pip install .[dev]
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          cd tests
          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "CPU or local" ./


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/nv-nightly.yml
================================================
name: nv-nightly

on:
  schedule:
    - cron: "0 0 * * *"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu116, v100]

    steps:
      - uses: actions/checkout@v2

      - name: environment
        run: |
          echo "JobID: $AISC_NODE_INSTANCE_ID"
          nvidia-smi
          which python
          python --version
          which nvcc
          nvcc --version
          pip install --upgrade pip
          pip uninstall --yes torch torchvision triton
          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          # git checkout 1cc453d33
          git rev-parse --short HEAD
          pip uninstall --yes transformers
          pip install .

      - name: Install deepspeed
        run: |
          pip uninstall --yes deepspeed
          pip install .[dev,1bit,autotuning,inf]
          ds_report

      - name: Install lm-eval
        run: |
          pip uninstall --yes lm-eval
          pip install git+https://github.com/EleutherAI/lm-evaluation-harness
          # This is required until lm-eval makes a new release. v0.2.0 is
          # broken for latest version of transformers

      - name: Python environment
        run: |
          pip list

      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          cd tests
          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6"


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/nv-torch-latest-v100.yml
================================================
name: nv-torch-latest-v100

on:
  push:
    branches:
      - 'master'
      - 'staging**'
    paths-ignore:
      - 'docs/**'
  pull_request:
    paths-ignore:
      - 'docs/**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu116, v100]

    steps:
      - uses: actions/checkout@v2

      - name: environment
        run: |
          echo "JobID: $AISC_NODE_INSTANCE_ID"
          nvidia-smi
          which python
          python --version
          which nvcc
          nvcc --version
          pip install --upgrade pip
          pip uninstall --yes torch torchvision triton
          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          # git checkout 1cc453d33
          git rev-parse --short HEAD
          pip uninstall --yes transformers
          pip install .

      - name: Install deepspeed
        run: |
          pip uninstall --yes deepspeed
          pip install .[dev,1bit,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          cd tests
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -n 4 unit/ --torch_ver="1.13" --cuda_ver="11.6"
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -m 'sequential' unit/ --torch_ver="1.13" --cuda_ver="11.6"


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/nv-torch-nightly-v100.yml
================================================
name: nv-torch-nightly-v100

on:
  schedule:
    - cron: "0 0 * * *"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu116, v100]

    steps:
      - uses: actions/checkout@v2

      - name: environment
        run: |
          echo "JobID: $AISC_NODE_INSTANCE_ID"
          nvidia-smi
          which python
          python --version
          which nvcc
          nvcc --version
          pip install --upgrade pip
          pip uninstall --yes torch torchvision triton
          pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          # git checkout 1cc453d33
          git rev-parse --short HEAD
          pip uninstall --yes transformers
          pip install .

      - name: Install deepspeed
        run: |
          pip uninstall --yes deepspeed
          pip install .[dev,1bit,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          cd tests
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/nv-torch18-p40.yml
================================================
name: nv-torch18-p40

on:
  push:
    branches:
      - 'master'
      - 'staging**'
    paths-ignore:
      - 'docs/**'
  pull_request:
    paths-ignore:
      - 'docs/**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu101, p40]

    steps:
      - uses: actions/checkout@v2

      - name: environment
        run: |
          echo "JobID: $AISC_NODE_INSTANCE_ID"
          nvidia-smi
          which python
          python --version
          which nvcc
          nvcc --version
          pip install --upgrade pip
          pip uninstall --yes torch torchvision triton
          pip install torch==1.8.2 torchvision==0.9.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu101
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          # git checkout 1cc453d33
          git rev-parse --short HEAD
          pip uninstall --yes transformers
          pip install .

      - name: Install deepspeed
        run: |
          pip uninstall --yes deepspeed
          pip install .[dev,1bit,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      - name: Unit tests
        run: |
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          cd tests
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="10.1"


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/nv-torch18-v100.yml
================================================
name: nv-torch18-v100

on:
  push:
    branches:
      - 'master'
      - 'staging**'
    paths-ignore:
      - 'docs/**'
  pull_request:
    paths-ignore:
      - 'docs/**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu111, v100]

    steps:
      - uses: actions/checkout@v2

      - name: environment
        run: |
          echo "JobID: $AISC_NODE_INSTANCE_ID"
          nvidia-smi
          which python
          python --version
          which nvcc
          nvcc --version
          pip install --upgrade pip
          pip uninstall --yes torch torchvision triton
          pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          # git checkout 1cc453d33
          git rev-parse --short HEAD
          pip uninstall --yes transformers
          pip install .

      - name: Install deepspeed
        run: |
          pip uninstall --yes deepspeed
          pip install .[dev,1bit,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          cd tests
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4  unit/ --torch_ver="1.8" --cuda_ver="11"
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/ --torch_ver="1.8" --cuda_ver="11"


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/nv-transformers-v100.yml
================================================
name: nv-transformers-v100

on:
  push:
    branches:
      - 'master'
      - 'staging**'
    paths-ignore:
      - 'docs/**'
  pull_request:
    paths-ignore:
      - 'docs/**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu111, v100]

    steps:
      - uses: actions/checkout@v2

      - name: environment
        run: |
          echo "JobID: $AISC_NODE_INSTANCE_ID"
          nvidia-smi
          which python
          python --version
          which nvcc
          nvcc --version
          pip install --upgrade pip
          pip uninstall --yes torch torchvision triton
          pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
          sudo apt-get update
          sudo apt-get install -y libaio-dev

      - name: Install deepspeed
        run: |
          pip uninstall --yes deepspeed
          pip install .[dev,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      - name: HF transformers tests
        run: |
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          #git checkout 6268694e2
          git rev-parse --short HEAD
          # scipy/sklearn required for tests, using the 'dev' extra forces torch re-install
          pip install .[testing]
          # find reqs used in ds integration tests
          find examples/pytorch -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec grep -v 'torch' {} \; | xargs -I {} pip install --upgrade {}
          # force datasets version due to issues
          pip install datasets==2.2.2
          # force protobuf version due to issues
          pip install "protobuf<4.21.0"
          pip list
          HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ WANDB_DISABLED=true TORCH_EXTENSIONS_DIR=./torch-extensions RUN_SLOW=1 pytest --color=yes --durations=0 --verbose tests/deepspeed


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/pre-compile-ops.yml
================================================
# Manually triggered workflow: installs DeepSpeed with pre-compiled ops (DS_BUILD_OPS=1), then runs the formatting checks and unit tests

name: Tests-w-precompiled-ops

# Controls when the action will run.
on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "build"
  build:
    # The type of runner that the job will run on
    runs-on: self-hosted

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v2

      # Runs a set of commands using the runner's shell
      - name: environment
        run: |
          nvidia-smi
          which python
          python --version
          which nvcc
          nvcc --version
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      # Runs a set of commands using the runner's shell
      - name: Install deepspeed
        run: |
          DS_BUILD_OPS=1 pip install .[dev]
          ds_report

      - name: Formatting checks
        run: |
           pre-commit run --all-files

      # Runs a set of commands using the runner's shell
      - name: Unit tests
        run: |
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/


================================================
FILE: benchmark/third_party/DeepSpeed/.github/workflows/python.yml
================================================
name: python

on:
  push:
    branches:
      - 'master'
      - 'staging**'
  pull_request:
    branches:
      '**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  version-check:
    strategy:
      matrix:
        pyVersion: ["3.6", "3.7", "3.8", "3.9", "3.10"]
      fail-fast: false

    runs-on: ubuntu-20.04
    container:
      image: deepspeed/gh-builder:py${{ matrix.pyVersion }}

    steps:
        - uses: actions/checkout@v2

        - name: environment
          run: |
            which python
            python --version
        - name: Install deepspeed
          run: |
            pip3 install .
        - name: DS Report
          run: |
             ds_report


================================================
FILE: benchmark/third_party/DeepSpeed/.gitignore
================================================
*.pyc
.idea/
*~
*.swp
*.log
deepspeed/git_version_info_installed.py
__pycache__

# Build + installation data
build/
dist/
*.so
deepspeed.egg-info/
build.txt

# Website
docs/_site/
docs/build
docs/code-docs/source/_build
docs/code-docs/_build
docs/code-docs/build
.sass-cache/
.jekyll-cache/
.jekyll-metadata

# Testing data
tests/unit/saved_checkpoint/

# Dev/IDE data
.vscode
.theia


================================================
FILE: benchmark/third_party/DeepSpeed/.pre-commit-config.yaml
================================================
repos:
-   repo: meta
    hooks:
    -   id: check-hooks-apply
    -   id: check-useless-excludes

-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
    -   id: check-case-conflict
    -   id: check-json
    -   id: check-symlinks
    -   id: check-yaml
    -   id: destroyed-symlinks
    -   id: end-of-file-fixer
        exclude: docs/CNAME
    -   id: fix-byte-order-marker
    -   id: fix-encoding-pragma
        args: [--remove]
    -   id: mixed-line-ending
        args: [--fix=lf]
    -   id: requirements-txt-fixer
    -   id: trailing-whitespace

-   repo: https://github.com/pre-commit/mirrors-yapf
    rev: v0.31.0
    hooks:
    -   id: yapf

-   repo: https://gitlab.com/daverona/pre-commit-cpp
    rev: 0.8.0
    hooks:
    -   id: clang-format  # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
        args: []

-   repo: local
    hooks:
    -   id: check-torchdist
        name: check-torchdist
        entry: ./scripts/check-torchdist.py
        language: script
        exclude: ^(deepspeed/comm/|docs/|benchmarks/|scripts/check-torchdist.py|deepspeed/moe/sharded_moe.py|deepspeed/runtime/comm/coalesced_collectives.py|deepspeed/elasticity/elastic_agent.py|deepspeed/launcher/launch.py|tests/unit/comm/test_dist.py)
        # Specific deepspeed/ files are excluded for now until we wrap ProcessGroup in deepspeed.comm

-   repo: https://github.com/codespell-project/codespell
    rev: v2.1.0
    hooks:
    -   id: codespell
        args: [
            # Do not check files that are automatically generated
            '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json',
            '--ignore-regex=\\n',  # Do not count the 'n' in an escaped newline as part of a word
            '--ignore-words-list=unsupport',  # Word used in error messages that need rewording
            --check-filenames,
            --check-hidden
        ]

-   repo: https://github.com/pycqa/flake8
    rev: 4.0.1
    hooks:
    -   id: flake8
        args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401']


================================================
FILE: benchmark/third_party/DeepSpeed/.pylintrc
================================================
[MASTER]

# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=

# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS

# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=

# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=

# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
# number of processors available to use.
jobs=1

# Control the amount of potential inferred values when inferring a single
# object. This can help the performance when dealing with large functions or
# complex, nested conditions.
limit-inference-results=100

# List of plugins (as comma separated values of python module names) to load,
# usually to register additional checkers.
load-plugins=

# Pickle collected data for later comparisons.
persistent=yes

# Specify a configuration file.
#rcfile=

# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes

# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no


[MESSAGES CONTROL]

# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
confidence=

# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then re-enable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=print-statement,
        parameter-unpacking,
        unpacking-in-except,
        old-raise-syntax,
        backtick,
        long-suffix,
        old-ne-operator,
        old-octal-literal,
        import-star-module-level,
        non-ascii-bytes-literal,
        raw-checker-failed,
        bad-inline-option,
        locally-disabled,
        file-ignored,
        suppressed-message,
        useless-suppression,
        deprecated-pragma,
        use-symbolic-message-instead,
        apply-builtin,
        basestring-builtin,
        buffer-builtin,
        cmp-builtin,
        coerce-builtin,
        execfile-builtin,
        file-builtin,
        long-builtin,
        raw_input-builtin,
        reduce-builtin,
        standarderror-builtin,
        unicode-builtin,
        xrange-builtin,
        coerce-method,
        delslice-method,
        getslice-method,
        setslice-method,
        no-absolute-import,
        old-division,
        dict-iter-method,
        dict-view-method,
        next-method-called,
        metaclass-assignment,
        indexing-exception,
        raising-string,
        reload-builtin,
        oct-method,
        hex-method,
        nonzero-method,
        cmp-method,
        input-builtin,
        round-builtin,
        intern-builtin,
        unichr-builtin,
        map-builtin-not-iterating,
        zip-builtin-not-iterating,
        range-builtin-not-iterating,
        filter-builtin-not-iterating,
        using-cmp-argument,
        eq-without-hash,
        div-method,
        idiv-method,
        rdiv-method,
        exception-message-attribute,
        invalid-str-codec,
        sys-max-int,
        bad-python3-import,
        deprecated-string-function,
        deprecated-str-translate-call,
        deprecated-itertools-function,
        deprecated-types-field,
        next-method-defined,
        dict-items-not-iterating,
        dict-keys-not-iterating,
        dict-values-not-iterating,
        deprecated-operator-function,
        deprecated-urllib-function,
        xreadlines-attribute,
        deprecated-sys-function,
        exception-escape,
        comprehension-escape

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by comma (,) or put this option
# multiple times (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=c-extension-no-member


[REPORTS]

# Python expression which should return a score less than or equal to 10. You
# have access to the variables 'error', 'warning', 'refactor', and 'convention'
# which contain the number of messages in each category, as well as 'statement'
# which is the total number of statements analyzed. This score is used by the
# global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)

# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details.
#msg-template=

# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text

# Tells whether to display a full report or only the messages.
reports=no

# Activate the evaluation score.
score=yes


[REFACTORING]

# Maximum number of nested blocks for function / method body
max-nested-blocks=5

# Complete names of functions that never return. When checking for
# inconsistent-return-statements if a never returning function is called then
# it will be considered as an explicit return statement and no message will be
# printed.
never-returning-functions=sys.exit


[BASIC]

# Naming style matching correct argument names.
argument-naming-style=snake_case

# Regular expression matching correct argument names. Overrides argument-
# naming-style.
#argument-rgx=

# Naming style matching correct attribute names.
attr-naming-style=snake_case

# Regular expression matching correct attribute names. Overrides attr-naming-
# style.
#attr-rgx=

# Bad variable names which should always be refused, separated by a comma.
bad-names=foo,
          bar,
          baz,
          toto,
          tutu,
          tata

# Naming style matching correct class attribute names.
class-attribute-naming-style=any

# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style.
#class-attribute-rgx=

# Naming style matching correct class names.
class-naming-style=PascalCase

# Regular expression matching correct class names. Overrides class-naming-
# style.
#class-rgx=

# Naming style matching correct constant names.
const-naming-style=UPPER_CASE

# Regular expression matching correct constant names. Overrides const-naming-
# style.
#const-rgx=

# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1

# Naming style matching correct function names.
function-naming-style=snake_case

# Regular expression matching correct function names. Overrides function-
# naming-style.
#function-rgx=

# Good variable names which should always be accepted, separated by a comma.
good-names=i,
           j,
           k,
           ex,
           Run,
           _

# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no

# Naming style matching correct inline iteration names.
inlinevar-naming-style=any

# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style.
#inlinevar-rgx=

# Naming style matching correct method names.
method-naming-style=snake_case

# Regular expression matching correct method names. Overrides method-naming-
# style.
#method-rgx=

# Naming style matching correct module names.
module-naming-style=snake_case

# Regular expression matching correct module names. Overrides module-naming-
# style.
#module-rgx=

# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=

# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_

# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken into consideration only for invalid-name.
property-classes=abc.abstractproperty

# Naming style matching correct variable names.
variable-naming-style=snake_case

# Regular expression matching correct variable names. Overrides variable-
# naming-style.
#variable-rgx=


[LOGGING]

# Format style used to check logging format string. `old` means using %
# formatting, `new` is for `{}` formatting, and `fstr` is for f-strings.
logging-format-style=old

# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging


[TYPECHECK]

# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager

# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=

# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes

# Tells whether to warn about missing members when the owner of the attribute
# is inferred to be None.
ignore-none=yes

# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes

# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local

# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=

# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes

# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1

# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1

# List of decorators that change the signature of a decorated function.
signature-mutators=


[SIMILARITIES]

# Ignore comments when computing similarities.
ignore-comments=yes

# Ignore docstrings when computing similarities.
ignore-docstrings=yes

# Ignore imports when computing similarities.
ignore-imports=no

# Minimum lines number of a similarity.
min-similarity-lines=4


[STRING]

# This flag controls whether the implicit-str-concat-in-sequence should
# generate a warning on implicit string concatenation in sequences defined over
# several lines.
check-str-concat-over-line-jumps=no


[VARIABLES]

# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=

# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes

# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,
          _cb

# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_

# Argument names that match this expression will be ignored. Default to name
# with leading underscore.
ignored-argument-names=_.*|^ignored_|^unused_

# Tells whether we should check for unused import in __init__ files.
init-import=no

# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io


[FORMAT]

# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=

# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$

# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4

# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
indent-string='    '

# Maximum number of characters on a single line.
max-line-length=90

# Maximum number of lines in a module.
max-module-lines=1000

# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1  : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=trailing-comma,
               dict-separator

# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no

# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no


[MISCELLANEOUS]

# List of note tags to take in consideration, separated by a comma.
notes=FIXME,
      XXX,
      TODO


[SPELLING]

# Limits count of emitted suggestions for spelling mistakes.
max-spelling-suggestions=4

# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=

# List of comma separated words that should not be checked.
spelling-ignore-words=

# A path to a file that contains the private dictionary; one word per line.
spelling-private-dict-file=

# Tells whether to store unknown words to the private dictionary (see the
# --spelling-private-dict-file option) instead of raising a message.
spelling-store-unknown-words=no


[CLASSES]

# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
                      __new__,
                      setUp,
                      __post_init__

# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
                  _fields,
                  _replace,
                  _source,
                  _make

# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls

# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=cls


[DESIGN]

# Maximum number of arguments for function / method.
max-args=10

# Maximum number of attributes for a class (see R0902).
max-attributes=20

# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5

# Maximum number of branches for function / method body.
max-branches=12

# Maximum number of locals for function / method body.
max-locals=15

# Maximum number of parents for a class (see R0901).
max-parents=7

# Maximum number of public methods for a class (see R0904).
max-public-methods=20

# Maximum number of return / yield for function / method body.
max-returns=6

# Maximum number of statements in function / method body.
max-statements=50

# Minimum number of public methods for a class (see R0903).
min-public-methods=2


[IMPORTS]

# List of modules that can be imported at any level, not just the top level
# one.
allow-any-import-level=

# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no

# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no

# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=optparse,tkinter.tix

# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled).
ext-import-graph=

# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled).
import-graph=

# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled).
int-import-graph=

# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=

# Force import order to recognize a module as part of a third party library.
known-third-party=enchant

# Couples of modules and preferred modules, separated by a comma.
preferred-modules=


[EXCEPTIONS]

# Exceptions that will emit a warning when being caught. Defaults to
# "BaseException, Exception".
overgeneral-exceptions=BaseException,
                       Exception


================================================
FILE: benchmark/third_party/DeepSpeed/.readthedocs.yml
================================================

# Required
version: 2

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/code-docs/source/conf.py
  fail_on_warning: false

# Optionally build your docs in additional formats such as PDF
formats:
  - pdf

# Optionally set the version of Python and requirements required to build your docs
python:
  version: 3.7
  install:
    - requirements: requirements/requirements-readthedocs.txt


================================================
FILE: benchmark/third_party/DeepSpeed/.style.yapf
================================================
[style]
SPLIT_ALL_COMMA_SEPARATED_VALUES = true
COLUMN_LIMIT = 89


================================================
FILE: benchmark/third_party/DeepSpeed/CODEOWNERS
================================================
# This file is used to subscribe for notifications for PRs
# related to specific file paths, does not necessarily mean
# approval is required from these people before merging.
#
# Learn more about CODEOWNERS syntax here:
# https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners


# top-level repo folders
/.github/ @jeffra @mrwyattii
/azure/ @jeffra @awan-10
/benchmarks/ @jeffra @awan-10 @mrwyattii @molly-smith
/bin/ @jeffra
/csrc/ @RezaYazdaniAminabadi @awan-10 @jeffra @cmikeh2 @arashb
/deepspeed/ @jeffra
/docker/ @jeffra @awan-10
/docs/ @jeffra @mrwyattii
/examples/ @jeffra @awan-10 @mrwyattii
/op_builder/ @jeffra @RezaYazdaniAminabadi @cmikeh2
/release/ @jeffra @mrwyattii
/requirements/ @jeffra @mrwyattii
/scripts/ @jeffra @awan-10
/tests/ @jeffra @mrwyattii @tjruwase

# deepspeed
/deepspeed/autotuning/ @cli99
/deepspeed/checkpoint/ @tjruwase
/deepspeed/comm/ @awan-10
/deepspeed/compression/ @yaozhewei @minjiaz @xiaoxiawu-microsoft @conglongli
/deepspeed/elasticity/ @jeffra @awan-10
/deepspeed/launcher/ @jeffra @awan-10
/deepspeed/module_inject/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/moe/ @awan-10
/deepspeed/monitor/ @awan-10 @jeffra
/deepspeed/nebula/ @tjruwase @jeffra
/deepspeed/ops/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/pipe/ @ShadenSmith @duli2012
/deepspeed/profiling/ @cli99
/deepspeed/utils/ @jeffra @tjruwase @awan-10

# inference
/deepspeed/inference/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/model_implementations/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb

# training
/deepspeed/runtime/ @jeffra @tjruwase
/deepspeed/runtime/activation_checkpointing/ @jeffra @tjruwase
/deepspeed/runtime/checkpoint_engine/ @tjruwase @jeffra
/deepspeed/runtime/comm/ @awan-10
/deepspeed/runtime/compression/ @awan-10 @conglongli
/deepspeed/runtime/data_pipeline/ @conglongli
/deepspeed/runtime/fp16/ @jeffra @tjruwase
/deepspeed/runtime/fp16/onebit/ @conglongli @awan-10
/deepspeed/runtime/pipe/ @ShadenSmith @duli2012
/deepspeed/runtime/swap_tensor/ @tjruwase @mrwyattii
/deepspeed/runtime/zero/ @jeffra @tjruwase @samyam @mrwyattii


================================================
FILE: benchmark/third_party/DeepSpeed/CODE_OF_CONDUCT.md
================================================
# Microsoft Open Source Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).

Resources:

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns


================================================
FILE: benchmark/third_party/DeepSpeed/CONTRIBUTING.md
================================================
# Contributing
DeepSpeed welcomes your contributions!

## Prerequisites
DeepSpeed uses [pre-commit](https://pre-commit.com/) to ensure that formatting is
consistent across DeepSpeed. First, ensure that `pre-commit` is installed from either
installing DeepSpeed or `pip install pre-commit`. Next, the pre-commit hooks must be
installed once before commits can be made:
```bash
pre-commit install
```

Afterwards, our suite of formatting tests run automatically before each `git commit`. You
can also run these manually:
```bash
pre-commit run --all-files
```
If a formatting test fails, it will fix the modified code in place and abort
the `git commit`. After looking over the changes, you can `git add <modified files>`
and then repeat the previous `git commit` command.


## Testing
DeepSpeed tracks two types of tests: unit tests and more costly model convergence tests.
The model convergence tests train
[DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/) and measure
end-to-end convergence and related metrics. Unit tests are found in `tests/unit/` and
the model convergence tests are found in `tests/model/`.

### Unit Tests
[PyTest](https://docs.pytest.org/en/latest/) is used to execute tests. PyTest can be
installed from PyPI via `pip install pytest`. Simply invoke `pytest --forked` to run the
unit tests:
```bash
pytest --forked tests/unit/
```
You can also provide the `-v` flag to `pytest` to see additional information about the
tests. Note that [pytest-forked](https://github.com/pytest-dev/pytest-forked) and the
`--forked` flag are required to test CUDA functionality in distributed tests.

### Model Tests
To execute model tests, first [install DeepSpeed](#installation). The
[DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/) repository is cloned
as part of this process. Next, execute the model test driver:
```bash
cd tests/model/
pytest run_sanity_check.py
```
Note that the `--forked` flag is not necessary for the model tests.

## Contributor License Agreement
This project welcomes contributions and suggestions. Most contributions require you to
agree to a Contributor License Agreement (CLA) declaring that you have the right to, and
actually do, grant us the rights to use your contribution. For details, visit
https://cla.opensource.microsoft.com.

When you submit a pull request, a CLA bot will automatically determine whether you need
to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply
follow the instructions provided by the bot. You will only need to do this once across
all repos using our CLA.

## Code of Conduct
This project has adopted the [Microsoft Open Source Code of
Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the
[Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact
[opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or
comments.

## New Feature Contribution Guidelines
Unlike fixing a bug or improving an existing feature (where users usually submit a PR directly and we review it), adding a new feature to DeepSpeed requires several steps: (1) proposal and discussion, (2) implementation and verification, (3) release and maintenance. This general guideline applies to all new feature contributions. Core DeepSpeed team member contributions may complete step 1 internally.

### Step 1: proposal and discussion
We ask users to first post their intended feature in an issue. This issue needs to include:

* A description of the proposed feature.
* A motivation of why it will be useful to DeepSpeed users.
* A rough design of how you implement the feature inside DeepSpeed.
* (Important) Results or planned experiments to demonstrate the effectiveness and correctness of the feature.
  * If this is a general feature applicable to different tasks, we require testing it on at least one CV task (e.g., [CIFAR](https://www.deepspeed.ai/tutorials/cifar-10/)) and one NLP task (e.g., [SQuAD](https://www.deepspeed.ai/tutorials/bert-finetuning/)). If this is a feature for one kind of task only, it is fine to just test on the specific task.
  * If the feature only affects performance and does not affect training convergence, we require testing on a fraction of training to demonstrate that the training/validation loss are consistent with baseline, and that the performance is better than baseline.
  * If the feature does affect training convergence, we require testing the whole training to demonstrate that the feature achieves better/on-par final model quality and training performance compared to baseline.

Based on the issue, we will discuss the merits of the new feature and decide whether to accept or decline the proposal. Once accepted and after we confirm the design and implementation plan, we are ready for step 2.

### Step 2: implementation and verification
The contributor will go ahead and implement the feature, and the DeepSpeed team will provide guidance/help as needed. The required deliverables include:

* A PR to [microsoft/DeepSpeed](https://github.com/microsoft/DeepSpeed) including (1) the feature implementation (2) unit tests (3) documentation (4) tutorial
* A PR to [microsoft/DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) or [microsoft/Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) including the examples of how to use the feature (this is related to the planned testing experiments in proposal)
* In the implementation (code, documentation, tutorial), we require the feature author to record their GitHub username as a contact method for future questions/maintenance.

After receiving the PRs, we will review them and merge them after necessary tests/fixes.

### Step 3: release and maintenance
After the PRs are merged, we will announce the feature on our website (with credit to the feature author). We ask the feature author to commit to the maintenance of the feature.


================================================
FILE: benchmark/third_party/DeepSpeed/LICENSE
================================================
    MIT License

    Copyright (c) Microsoft Corporation.

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in all
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE


================================================
FILE: benchmark/third_party/DeepSpeed/MANIFEST.in
================================================
include *.txt README.md
recursive-include requirements *.txt
recursive-include deepspeed *.cpp *.h *.cu *.hip *.tr *.cuh *.cc *.json
recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc
recursive-include op_builder *.py
recursive-include benchmarks *.py


================================================
FILE: benchmark/third_party/DeepSpeed/MANIFEST_win.in
================================================
include *.txt README.md
recursive-include requirements *.txt

# this is for Windows only
recursive-include deepspeed *.tr
recursive-exclude deepspeed/ops/csrc *.cpp *.h *.cu *.cuh *.cc
prune csrc
prune op_builder


================================================
FILE: benchmark/third_party/DeepSpeed/README.md
================================================
[![License MIT](https://badgen.net/badge/license/MIT/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE)
[![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/)
[![Downloads](https://pepy.tech/badge/deepspeed)](https://pepy.tech/project/deepspeed)
[![Build](https://badgen.net/badge/build/check-status/blue)](#build-pipeline-status)


<div align="center">
 <img src="docs/assets/images/DeepSpeed_light.svg#gh-light-mode-only" width="400px">
 <img src="docs/assets/images/DeepSpeed_dark_transparent.svg#gh-dark-mode-only" width="400px">
</div>

## Latest News
<b> DeepSpeed trained the world's most powerful language models ([MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [BLOOM](https://huggingface.co/blog/bloom-megatron-deepspeed)); [learn how](https://www.deepspeed.ai/tutorials/large-models-w-deepspeed/).</b>

* [2022/11] [Stable Diffusion Image Generation under 1 second w. DeepSpeed MII](https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/benchmark/txt2img)
* [2022/10] [DeepSpeed-MII: instant speedup on 24,000+ open-source DL models with up to 40x cheaper inference](https://www.deepspeed.ai/2022/10/10/mii.html)
* [2022/09] [ZeRO-Inference: Democratizing massive model inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html)
* [2022/07] [Azure and DeepSpeed empower easy-to-use and high-performance model training](https://azure.microsoft.com/en-us/blog/azure-empowers-easytouse-highperformance-and-hyperscale-model-training-using-deepspeed/)
* [2022/07] [DeepSpeed Compression: A composable library for extreme compression](https://www.microsoft.com/en-us/research/blog/deepspeed-compression-a-composable-library-for-extreme-compression-and-zero-cost-quantization/)

---

# Extreme Speed and Scale for DL Training and Inference

[DeepSpeed](https://www.deepspeed.ai/) is an easy-to-use deep learning optimization software suite that enables unprecedented scale and speed for Deep Learning Training and Inference. With DeepSpeed you can:

* Train/Inference dense or sparse models with billions or trillions of parameters
* Achieve excellent system throughput and efficiently scale to thousands of GPUs
* Train/Inference on resource constrained GPU systems
* Achieve unprecedented low latency and high throughput for inference
* Achieve extreme compression for an unparalleled inference latency and model size reduction with low costs

---

# DeepSpeed's three innovation pillars

<img src="docs/assets/images/3pillars.png" width="800px">


## DeepSpeed-Training

DeepSpeed offers a confluence of system innovations that has made large-scale DL training effective and efficient, greatly improved ease of use, and redefined the DL training landscape in terms of the scale that is possible. These innovations, such as ZeRO, 3D-Parallelism, DeepSpeed-MoE, ZeRO-Infinity, etc., fall under the training pillar. Learn more: [DeepSpeed-Training](https://www.deepspeed.ai/training/)

## DeepSpeed-Inference

DeepSpeed brings together innovations in parallelism technology such as tensor, pipeline, expert and ZeRO-parallelism, and combines them with high performance custom inference kernels, communication optimizations and heterogeneous memory technologies to enable inference at an unprecedented scale, while achieving unparalleled latency, throughput and cost reduction. This systematic composition of system technologies for inference falls under the inference pillar. Learn more: [DeepSpeed-Inference](https://www.deepspeed.ai/inference)


## DeepSpeed-Compression

To further increase the inference efficiency, DeepSpeed offers easy-to-use and flexible-to-compose compression techniques for researchers and practitioners to compress their models while delivering faster speed, smaller model size, and significantly reduced compression cost. Moreover, SoTA innovations on compression like ZeroQuant and XTC are included under the compression pillar. Learn more: [DeepSpeed-Compression](https://www.deepspeed.ai/compression)

---

# DeepSpeed Software Suite

## DeepSpeed Library

   The [DeepSpeed](https://github.com/microsoft/deepspeed) library (this repository) implements and packages the innovations and technologies in DeepSpeed Training, Inference and Compression Pillars into a single easy-to-use, open-sourced repository. It allows for easy composition of a multitude of features within a single training, inference or compression pipeline. The DeepSpeed Library is heavily adopted by the DL community, and has been used to enable some of the most powerful models (see [DeepSpeed Adoption](#deepspeed-adoption)).

## Model Implementations for Inference (MII)

   [Model Implementations for Inference (MII)](https://github.com/microsoft/deepspeed-mii) is an open-sourced repository for making low-latency and high-throughput inference accessible to all data scientists by alleviating the need to apply complex system optimization techniques themselves. Out-of-box, MII offers support for thousands of widely used DL models, optimized using DeepSpeed-Inference, that can be deployed with a few lines of code, while achieving significant latency reduction compared to their vanilla open-sourced versions.

## DeepSpeed on Azure

   DeepSpeed users are diverse and have access to different environments. We recommend trying DeepSpeed on Azure as it is the simplest and easiest method. The recommended method to try DeepSpeed on Azure is through AzureML [recipes](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed). The job submission and data preparation scripts have been made available [here](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azureml). For more details on how to use DeepSpeed on Azure, please follow the [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/).

---

# DeepSpeed Adoption

DeepSpeed is an important part of Microsoft’s new
[AI at Scale](https://www.microsoft.com/en-us/research/project/ai-at-scale/)
initiative to enable next-generation AI capabilities at scale, where you can find more
information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale).

DeepSpeed has been used to train many different large-scale models. Below is a list of several examples that we are aware of (if you'd like to include your model, please submit a PR):

  * [Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/)
  * [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)
  * [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed)
  * [GLM (130B)](https://github.com/THUDM/GLM-130B)
  * [YaLM (100B)](https://github.com/yandex/YaLM-100B)
  * [GPT-NeoX (20B)](https://github.com/EleutherAI/gpt-neox)
  * [AlexaTM (20B)](https://www.amazon.science/blog/20b-parameter-alexa-model-sets-new-marks-in-few-shot-learning)
  * [Turing NLG (17B)](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/)
  * [METRO-LM (5.4B)](https://arxiv.org/pdf/2204.06644.pdf)

DeepSpeed has been integrated with several different popular open-source DL frameworks such as:

|                                                                                                | Documentation                                |
| ---------------------------------------------------------------------------------------------- | -------------------------------------------- |
| <img src="docs/assets/images/transformers-light.png#gh-light-mode-only" width="250px"><img src="docs/assets/images/transformers-dark.png#gh-dark-mode-only" width="250px"> | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) |
| <img src="docs/assets/images/accelerate-light.png#gh-light-mode-only" width="250px"><img src="docs/assets/images/accelerate-dark.png#gh-dark-mode-only" width="250px"> | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) |
| <img src="docs/assets/images/lightning-light.svg#gh-light-mode-only" width="200px"><img src="docs/assets/images/lightning-dark.svg#gh-dark-mode-only" width="200px"> | [Lightning with DeepSpeed](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html) |
| <img src="docs/assets/images/mosaicml.svg" width="200px"> | [MosaicML with DeepSpeed](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) |
| <img src="docs/assets/images/determined.svg" width="225px"> | [Determined with DeepSpeed](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) |

---

# Build Pipeline Status

| Description | Status |
| ----------- | ------ |
| NVIDIA | [![nv-torch12-p40](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch12-p40.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch12-p40.yml) [![nv-torch18-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch18-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch18-v100.yml) [![nv-torch-latest-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml) |
| AMD | [![amd](https://github.com/microsoft/DeepSpeed/actions/workflows/amd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd.yml) |
| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) |
| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) |
| Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)|

# Installation

The quickest way to get started with DeepSpeed is via pip; this will install
the latest release of DeepSpeed which is not tied to specific PyTorch or CUDA
versions. DeepSpeed includes several C++/CUDA extensions that we commonly refer
to as our 'ops'.  By default, all of these extensions/ops will be built
just-in-time (JIT) using [torch's JIT C++ extension loader that relies on
ninja](https://pytorch.org/docs/stable/cpp_extension.html) to build and
dynamically link them at runtime.

## Requirements
* [PyTorch](https://pytorch.org/) must be installed _before_ installing DeepSpeed.
* For full feature support we recommend a version of PyTorch that is >= 1.8 and ideally the latest PyTorch stable release.
* A CUDA or ROCm compiler such as [nvcc](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#introduction) or [hipcc](https://github.com/ROCm-Developer-Tools/HIPCC) used to compile C++/CUDA/HIP extensions.
* The specific GPUs we develop and test against are listed below. This doesn't mean your GPU will not work if it doesn't fall into this category; it's just that DeepSpeed is most thoroughly tested on the following:
  * NVIDIA: Pascal, Volta, Ampere, and Hopper architectures
  * AMD: MI100 and MI200

## PyPI
We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases.

```bash
pip install deepspeed
```

After installation, you can validate your install and see which extensions/ops
your machine is compatible with via the DeepSpeed environment report.

```bash
ds_report
```

If you would like to pre-install any of the DeepSpeed extensions/ops (instead
of JIT compiling) or install pre-compiled ops via PyPI please see our [advanced
installation instructions](https://www.deepspeed.ai/tutorials/advanced-install/).

## Windows
DeepSpeed partially supports Windows. On Windows you can build a wheel with the following steps; currently only inference mode is supported.
1. Install pytorch, such as pytorch 1.8 + cuda 11.1
2. Install visual cpp build tools, such as VS2019 C++ x64/x86 build tools
3. Launch cmd console with Administrator privilege for creating required symlink folders
4. Run `python setup.py bdist_wheel` to build wheel in `dist` folder

# Features

Please check out the [DeepSpeed-Training](https://www.deepspeed.ai/training), [DeepSpeed-Inference](https://www.deepspeed.ai/inference) and [DeepSpeed-Compression](https://www.deepspeed.ai/compression) pages for the full set of features offered along each of these three pillars.

# Further Reading

All DeepSpeed documentation, tutorials, and blogs can be found on our website: [deepspeed.ai](https://www.deepspeed.ai/)


|                                                                                                | Description                                  |
| ---------------------------------------------------------------------------------------------- | -------------------------------------------- |
| [Getting Started](https://www.deepspeed.ai/getting-started/)                                   |  First steps with DeepSpeed                  |
| [DeepSpeed JSON Configuration](https://www.deepspeed.ai/docs/config-json/)                     |  Configuring DeepSpeed                       |
| [API Documentation](https://deepspeed.readthedocs.io/en/latest/)                               |  Generated DeepSpeed API documentation       |
| [Tutorials](https://www.deepspeed.ai/tutorials/)                                               |  Tutorials                                   |
| [Blogs](https://www.deepspeed.ai/posts/)                                                       |  Blogs                                   |


# Contributing
DeepSpeed welcomes your contributions! Please see our
[contributing](CONTRIBUTING.md) guide for more details on formatting, testing,
etc.

## Contributor License Agreement
This project welcomes contributions and suggestions. Most contributions require you to
agree to a Contributor License Agreement (CLA) declaring that you have the right to, and
actually do, grant us the rights to use your contribution. For details, visit
https://cla.opensource.microsoft.com.

When you submit a pull request, a CLA bot will automatically determine whether you need
to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply
follow the instructions provided by the bot. You will only need to do this once across
all repos using our CLA.

## Code of Conduct
This project has adopted the [Microsoft Open Source Code of
Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the
[Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact
[opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

# Publications
1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: memory optimizations toward training trillion parameter models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054) and [In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '20)](https://dl.acm.org/doi/10.5555/3433701.3433727).
2. Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. (2020) DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. [In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD '20, Tutorial)](https://dl.acm.org/doi/10.1145/3394486.3406703).
3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html).
4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840).
5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888) and [ICML 2021](http://proceedings.mlr.press/v139/tang21a.html).
6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857).
7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069).
8. Conglong Li, Minjia Zhang, Yuxiong He. (2021) Curriculum Learning: A Regularization Method for Efficient and Stable Billion-Scale GPT Model Pre-Training. [arXiv:2108.06084](https://arxiv.org/abs/2108.06084).
9. Yucheng Lu, Conglong Li, Minjia Zhang, Christopher De Sa, Yuxiong He. (2022) Maximizing Communication Efficiency for Large-scale Training via 0/1 Adam. [arXiv:2202.06009](https://arxiv.org/abs/2202.06009).
10. Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He. (2022) DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale [arXiv:2201.05596](https://arxiv.org/abs/2201.05596).
11. Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, Elton Zhang, Rewon Child, Reza Yazdani Aminabadi, Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, Bryan Catanzaro. (2022) Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model [arXiv:2201.11990](https://arxiv.org/abs/2201.11990).
12. Xiaoxia Wu, Zhewei Yao, Minjia Zhang, Conglong Li, Yuxiong He. (2022) Extreme Compression for Pre-trained Transformers Made Simple and Efficient. [arXiv:2206.01859](https://arxiv.org/abs/2206.01859).
13. Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, Yuxiong He. (2022) ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. [arXiv:2206.01861](https://arxiv.org/abs/2206.01861).
14. Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, Yuxiong He. (2022) DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. [arXiv:2207.00032](https://arxiv.org/abs/2207.00032).


# Videos
1. DeepSpeed KDD 2020 Tutorial
    1. [Overview](https://www.youtube.com/watch?v=CaseqC45DNc&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=29)
    2. [ZeRO + large model training](https://www.youtube.com/watch?v=y4_bCiAsIAk&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=28)
    3. [17B T-NLG demo](https://www.youtube.com/watch?v=9V-ZbP92drg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=27)
    4. [Fastest BERT training + RScan tuning](https://www.youtube.com/watch?v=o1K-ZG9F6u0&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=26)
    5. DeepSpeed hands on deep dive: [part 1](https://www.youtube.com/watch?v=_NOk-mBwDYg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=92), [part 2](https://www.youtube.com/watch?v=sG6_c4VXLww&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=94), [part 3](https://www.youtube.com/watch?v=k9yPkBTayos&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=93)
    6. [FAQ](https://www.youtube.com/watch?v=nsHu6vEgPew&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=24)
2. Microsoft Research Webinar
    * Registration is free and all videos are available on-demand.
    * [ZeRO & Fastest BERT: Increasing the scale and speed of deep learning training in DeepSpeed](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html).
3. [DeepSpeed on AzureML](https://youtu.be/yBVXR8G8Bg8)
4. Community Tutorials
    * [DeepSpeed: All the tricks to scale to gigantic models (Mark Saroufim)](https://www.youtube.com/watch?v=pDGI668pNg0)
    * [Turing-NLG, DeepSpeed and the ZeRO optimizer (Yannic Kilcher)](https://www.youtube.com/watch?v=tC01FRB0M7w)
    * [Ultimate Guide To Scaling ML Models (The AI Epiphany)](https://www.youtube.com/watch?v=hc0u4avAkuM)


================================================
FILE: benchmark/third_party/DeepSpeed/SECURITY.md
================================================
<!-- BEGIN MICROSOFT SECURITY.MD V0.0.3 BLOCK -->

## Security

Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).

If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.

## Reporting Security Issues

**Please do not report security vulnerabilities through public GitHub issues.**

Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).

If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).

You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).

Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:

  * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
  * Full paths of source file(s) related to the manifestation of the issue
  * The location of the affected source code (tag/branch/commit or direct URL)
  * Any special configuration required to reproduce the issue
  * Step-by-step instructions to reproduce the issue
  * Proof-of-concept or exploit code (if possible)
  * Impact of the issue, including how an attacker might exploit the issue

This information will help us triage your report more quickly.

If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.

## Preferred Languages

We prefer all communications to be in English.

## Policy

Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).

<!-- END MICROSOFT SECURITY.MD BLOCK -->


================================================
FILE: benchmark/third_party/DeepSpeed/azure/README.md
================================================
# Getting Started with DeepSpeed on Azure

The recommended and simplest method to try DeepSpeed on Azure is through [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/). For more details, please see our [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/).


================================================
FILE: benchmark/third_party/DeepSpeed/benchmarks/__init__.py
================================================


================================================
FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/README.md
================================================
# Running Communication Benchmarks


To run benchmarks, there are two options:

1. Run a single communication operation:

For example, run with a single large message size:
<pre>
deepspeed all_reduce.py
</pre>

Scan across message sizes:
<pre>
deepspeed all_reduce.py --scan
</pre>

2. Run all available communication benchmarks:

<pre>
deepspeed run_all.py
</pre>

Like the individual benchmarks, `run_all.py` supports scanning arguments for the max message size, bw-unit, etc. Simply pass the desired arguments to `run_all.py` and they'll be propagated to each comm op.

<pre>
usage: ds_bench [-h] [--local_rank LOCAL_RANK] [--trials TRIALS] [--warmups WARMUPS] [--maxsize MAXSIZE] [--async-op] [--bw-unit {Gbps,GBps}] [--backend {nccl}] [--dist {deepspeed,torch}] [--scan] [--raw] [--all-reduce] [--all-gather] [--all-to-all]
                [--pt2pt] [--broadcast] [--dtype DTYPE] [--mem-factor MEM_FACTOR] [--debug]

optional arguments:
  -h, --help            show this help message and exit
  --local_rank LOCAL_RANK
  --trials TRIALS       Number of timed iterations
  --warmups WARMUPS     Number of warmup (non-timed) iterations
  --maxsize MAXSIZE     Max message size as a power of 2
  --async-op            Enables non-blocking communication
  --bw-unit {Gbps,GBps}
  --backend {nccl}      Communication library to use
  --dist {deepspeed,torch}
                        Distributed DL framework to use
  --scan                Enables scanning all message sizes
  --raw                 Print the message size and latency without units
  --all-reduce          Run all_reduce
  --all-gather          Run all_gather
  --all-to-all          Run all_to_all
  --pt2pt               Run pt2pt
  --broadcast           Run broadcast
  --dtype DTYPE         PyTorch tensor dtype
  --mem-factor MEM_FACTOR
                        Proportion of max available GPU memory to use for single-size evals
  --debug               Enables all_to_all debug prints
</pre>

Note that `ds_bench` is a pre-packaged wrapper around `run_all.py`. Users can pass the same arguments as well:

<pre>
<path to deepspeed>/bin/ds_bench --scan --trials=10
</pre>

Finally, users can choose specific communication operations to run in `run_all.py` or `ds_bench` by passing them as arguments (all operations are run by default). For example:

<pre>
deepspeed run_all.py --scan --all-reduce --all-to-all --broadcast
</pre>


# Adding Communication Benchmarks

To add new communication benchmarks, follow this general procedure:

1. Copy a similar benchmark file (e.g. to add `reduce_scatter`, copy `all_reduce.py` as a template)
2. Add a new bw formula in `utils.get_bw`, a new maximum tensor element formula in `utils.max_numel`, and a new arg in `utils.benchmark_parser`
3. Replace comm op calls in new file with find-replace
4. Find a good default `mem_factor` for use in `run_<collective>_single()` function
5. Add new comm op to `run_all.py`


================================================
FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/__init__.py
================================================


================================================
FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/all_gather.py
================================================
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *

import time


# Run all_gather and print metrics
def _all_gather_once(dist, input, output, args):
    """Issue one all-gather of ``input`` into ``output`` on the selected backend.

    Args:
        dist: The communication module chosen by the caller
            (``torch.distributed`` or ``deepspeed.comm``).
        input: This rank's contribution.
        output: Flat destination tensor that receives every rank's contribution.
        args: Parsed benchmark arguments; ``args.dist`` selects the backend and
            ``args.async_op`` is forwarded to the collective.
    """
    if args.dist == 'torch':
        # use all_gather_base if available
        if hasattr(torch.distributed, "_all_gather_base"):
            dist._all_gather_base(output, input, group=None, async_op=args.async_op)
        else:
            # Fallback for torch builds without _all_gather_base: split the flat
            # output into one view per rank so all_gather writes in place.
            output_tensors = list(torch.chunk(output, dist.get_world_size()))
            dist.all_gather(output_tensors,
                            input,
                            group=None,
                            async_op=args.async_op)
    elif args.dist == 'deepspeed':
        dist.allgather_fn(output, input, group=None, async_op=args.async_op)


# Run all_gather and print metrics
def timed_all_gather(input, output, args):
    """Time ``args.trials`` all-gather calls and print one result row.

    ``args.warmups`` untimed iterations run first. ``sync_all()`` is called
    before the warmups, between the warmups and the timed loop, and again
    before the clock is stopped.

    Args:
        input: This rank's tensor to gather.
        output: Flat destination tensor for the gathered data.
        args: Parsed benchmark arguments (uses ``dist``, ``warmups``,
            ``trials``, ``async_op`` and ``raw``).
    """
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    sync_all()
    # Warmups, establish connections, etc.
    for _ in range(args.warmups):
        _all_gather_once(dist, input, output, args)
    sync_all()

    # time the actual comm op trials times and average it
    pre = time.perf_counter()
    for _ in range(args.trials):
        _all_gather_once(dist, input, output, args)
    sync_all()
    duration = time.perf_counter() - pre

    # maintain and clean performance data
    avg_duration = duration / args.trials
    size = input.element_size() * input.nelement()
    tput, busbw = get_bw('all_gather', size, avg_duration, args)
    tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
    desc = f'{input.nelement()}x{input.element_size()}'

    # Unless raw output was requested, show the size in human-readable form.
    if not args.raw:
        size = convert_size(size)

    print_rank_0(
        f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")


def run_all_gather(local_rank, args):
    """Drive the all_gather benchmark for this process.

    With ``args.scan`` set, one measurement is taken for each ``M = 2**p``,
    ``p`` in ``[1, args.maxsize)``, using an input of ``world_size * M``
    elements. Otherwise a single measurement is taken with the element count
    returned by ``max_numel``.

    Args:
        local_rank: CUDA device index on which tensors are allocated.
        args: Parsed benchmark arguments (uses ``dist``, ``scan``, ``maxsize``,
            ``dtype`` and ``mem_factor``).

    Raises:
        RuntimeError: Any allocation error other than CUDA out-of-memory.
            Out-of-memory is reported on rank 0 and ends the benchmark.
    """
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    # Prepare benchmark header
    print_header(args, 'all_gather')
    global_rank = dist.get_rank()
    world_size = dist.get_world_size()
    dtype = getattr(torch, args.dtype)

    if args.scan:
        # Create list of message sizes
        M_LIST = [2**p for p in range(1, args.maxsize)]

        sync_all()
        # loop over various tensor sizes
        for M in M_LIST:
            try:
                mat = torch.ones(world_size, M, dtype=dtype).cuda(local_rank)
                sync_all()
                # multiply each GPU's tensor by the rank to ease debugging
                input = ((mat.mul_(float(global_rank))).view(-1))
                # Delete original mat to avoid OOM
                del mat
                torch.cuda.empty_cache()
                output = torch.zeros(input.nelement() * world_size,
                                     dtype=dtype).cuda(local_rank)
            except RuntimeError as e:
                # Only OOM is an expected way to end the scan. Anything else is
                # re-raised rather than timing stale or unbound tensors.
                if 'out of memory' not in str(e):
                    raise
                if dist.get_rank() == 0:
                    print('WARNING: Ran out of GPU memory. Exiting comm op.')
                sync_all()
                break
            sync_all()
            timed_all_gather(input, output, args)
    else:
        # all_gather_base saves memory
        # NOTE(review): `dist.has_allgather_base` is referenced without being
        # called; if deepspeed.comm exposes it as a function this is always
        # truthy -- confirm against the deepspeed.comm API.
        torch_has_base = (args.dist == 'torch'
                          and hasattr(torch.distributed,
                                      "_all_gather_base"))
        if torch_has_base or (args.dist == 'deepspeed'
                              and dist.has_allgather_base):
            mem_factor = args.mem_factor + 0.2
        else:
            mem_factor = args.mem_factor
        # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
        sync_all()
        elements_per_gpu = max_numel(comm_op='all_gather',
                                     dtype=dtype,
                                     mem_factor=mem_factor,
                                     local_rank=local_rank,
                                     args=args)
        try:
            mat = torch.ones(elements_per_gpu, dtype=dtype).cuda(local_rank)
            # multiply each GPU's tensor by the rank to ease debugging
            input = ((mat.mul_(float(global_rank))).view(-1))
            # Delete original mat to avoid OOM
            del mat
            torch.cuda.empty_cache()
            output = torch.zeros(elements_per_gpu * world_size,
                                 dtype=dtype).cuda(local_rank)
        except RuntimeError as e:
            # Re-raise anything that is not OOM; otherwise the code below would
            # fail on unbound tensors and hide the real error.
            if 'out of memory' not in str(e):
                raise
            if dist.get_rank() == 0:
                print(
                    'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
                )
            sync_all()
            return

        sync_all()
        timed_all_gather(input, output, args)


if __name__ == "__main__":
    # Script entry point: parse the CLI, initialise the distributed backend,
    # then run the all_gather benchmark on this process's local rank.
    args = benchmark_parser().parse_args()
    local_rank = args.local_rank
    init_processes(local_rank=local_rank, args=args)
    run_all_gather(local_rank=local_rank, args=args)


================================================
FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/all_reduce.py
================================================
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *

import time


def timed_all_reduce(input, args):
    """Time ``args.trials`` all-reduce calls on ``input`` and print one result row.

    ``args.warmups`` untimed iterations run first. ``sync_all()`` is called
    before the warmups, between the warmups and the timed loop, and again
    before the clock is stopped.

    Args:
        input: Tensor passed to every ``all_reduce`` call.
        args: Parsed benchmark arguments (uses ``dist``, ``warmups``,
            ``trials``, ``async_op`` and ``raw``).
    """
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    sync_all()
    # Warmups, establish connections, etc.
    for _ in range(args.warmups):
        dist.all_reduce(input, async_op=args.async_op)
    sync_all()

    # time the actual comm op trials times and average it
    pre = time.perf_counter()
    for _ in range(args.trials):
        dist.all_reduce(input, async_op=args.async_op)
    sync_all()
    duration = time.perf_counter() - pre

    # maintain and clean performance data
    avg_duration = duration / args.trials
    size = input.element_size() * input.nelement()
    tput, busbw = get_bw('all_reduce', size, avg_duration, args)
    tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
    desc = f'{input.nelement()}x{input.element_size()}'

    # Unless raw output was requested, show the size in human-readable form.
    if not args.raw:
        size = convert_size(size)

    print_rank_0(
        f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")


def run_all_reduce(local_rank, args):
    """Drive the all_reduce benchmark for this process.

    With ``args.scan`` set, one measurement is taken for each ``M = 2**p``,
    ``p`` in ``[1, args.maxsize)``, using a tensor of ``world_size * M``
    elements. Otherwise a single measurement is taken with the element count
    returned by ``max_numel``.

    Args:
        local_rank: CUDA device index on which tensors are allocated.
        args: Parsed benchmark arguments (uses ``dist``, ``scan``, ``maxsize``,
            ``dtype`` and ``mem_factor``).

    Raises:
        RuntimeError: Any allocation error other than CUDA out-of-memory.
            Out-of-memory is reported on rank 0 and ends the benchmark.
    """
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    # Prepare benchmark header
    print_header(args, 'all_reduce')

    world_size = dist.get_world_size()
    global_rank = dist.get_rank()
    dtype = getattr(torch, args.dtype)

    if args.scan:
        M_LIST = [2**p for p in range(1, args.maxsize)]

        sync_all()
        # loop over various tensor sizes
        for M in M_LIST:
            try:
                mat = torch.ones(world_size, M, dtype=dtype).cuda(local_rank)
                sync_all()
                # Scale by the rank so each process contributes distinct values.
                input = ((mat.mul_(float(global_rank))).view(-1))
            except RuntimeError as e:
                # Only OOM is an expected way to end the scan. Anything else is
                # re-raised rather than timing a stale or unbound tensor.
                if 'out of memory' not in str(e):
                    raise
                if dist.get_rank() == 0:
                    print('WARNING: Ran out of GPU memory. Exiting comm op.')
                sync_all()
                break
            sync_all()
            timed_all_reduce(input, args)
    else:
        # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
        # Don't need output tensor, so we double mem_factor
        elements_per_gpu = max_numel(comm_op='all_reduce',
                                     dtype=dtype,
                                     mem_factor=args.mem_factor * 2,
                                     local_rank=local_rank,
                                     args=args)
        try:
            mat = torch.ones(elements_per_gpu, dtype=dtype).cuda(local_rank)
            input = ((mat.mul_(float(global_rank))).view(-1))
        except RuntimeError as e:
            # Re-raise anything that is not OOM; otherwise the code below would
            # fail on an unbound tensor and hide the real error.
            if 'out of memory' not in str(e):
                raise
            if dist.get_rank() == 0:
                print(
                    'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
                )
            sync_all()
            return
        sync_all()
        timed_all_reduce(input, args)


if __name__ == "__main__":
    # Script entry point: parse the CLI, initialise the distributed backend,
    # then run the all_reduce benchmark on this process's local rank.
    args = benchmark_parser().parse_args()
    local_rank = args.local_rank
    init_processes(local_rank=local_rank, args=args)
    run_all_reduce(local_rank=local_rank, args=args)


================================================
FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/all_to_all.py
================================================
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *

import time


def timed_all_to_all(input, output, args):
    """Time ``args.trials`` ``all_to_all_single`` calls and print one result row.

    ``args.warmups`` untimed iterations run first. ``sync_all()`` is called
    before the warmups, between the warmups and the timed loop, and again
    before the clock is stopped.

    Args:
        input: Source tensor passed to ``all_to_all_single``.
        output: Destination tensor passed to ``all_to_all_single``.
        args: Parsed benchmark arguments (uses ``dist``, ``warmups``,
            ``trials``, ``async_op`` and ``raw``).
    """
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    sync_all()
    # Warmups, establish connections, etc.
    for _ in range(args.warmups):
        dist.all_to_all_single(output, input, async_op=args.async_op)
    sync_all()

    # time the actual comm op trials times and average it
    pre = time.perf_counter()
    for _ in range(args.trials):
        dist.all_to_all_single(output, input, async_op=args.async_op)
    sync_all()
    duration = time.perf_counter() - pre

    # maintain and clean performance data
    avg_duration = duration / args.trials
    size = input.element_size() * input.nelement()
    tput, busbw = get_bw('all_to_all', size, avg_duration, args)
    tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
    desc = f'{input.nelement()}x{input.element_size()}'

    # Unless raw output was requested, show the size in human-readable form.
    if not args.raw:
        size = convert_size(size)

    print_rank_0(
        f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")


def run_all_to_all(local_rank, args):
    """Drive the all_to_all benchmark for this process.

    With ``args.scan`` set, one measurement is taken for each ``M = 2**p``,
    ``p`` in ``[1, args.maxsize)``, using tensors of ``world_size * M``
    elements. Otherwise a single measurement is taken with the element count
    returned by ``max_numel``. With ``args.debug`` set, the single-size path
    also prints the input and output tensors rank by rank.

    Args:
        local_rank: CUDA device index on which tensors are allocated.
        args: Parsed benchmark arguments (uses ``dist``, ``scan``, ``maxsize``,
            ``dtype``, ``mem_factor`` and ``debug``).

    Raises:
        RuntimeError: Any allocation error other than CUDA out-of-memory.
            Out-of-memory is reported on rank 0 and ends the benchmark.
        AssertionError: If the tensor cannot be split evenly across ranks.
    """
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    world_size = dist.get_world_size()
    global_rank = dist.get_rank()
    # Prepare benchmark header
    print_header(args, 'all_to_all')
    dtype = getattr(torch, args.dtype)

    if args.scan:
        M_LIST = [2**p for p in range(1, args.maxsize)]

        sync_all()
        # loop over various tensor sizes
        for M in M_LIST:
            try:
                mat = torch.ones(world_size, M, dtype=dtype).cuda(local_rank)
                assert mat.numel() % world_size == 0, f"tensor cannot be divided in {world_size} chunks"
                sync_all()
                input = ((mat.mul_(float(global_rank))).view(-1))
                output = (mat.clone().view(-1))
            except RuntimeError as e:
                # Only OOM is an expected way to end the scan. Anything else is
                # re-raised rather than timing stale or unbound tensors.
                if 'out of memory' not in str(e):
                    raise
                if dist.get_rank() == 0:
                    print('WARNING: Ran out of GPU memory. Exiting comm op.')
                sync_all()
                break
            sync_all()
            timed_all_to_all(input, output, args)
    else:
        # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
        elements_per_gpu = max_numel(comm_op='all_to_all',
                                     dtype=dtype,
                                     mem_factor=args.mem_factor,
                                     local_rank=local_rank,
                                     args=args)
        try:
            mat = torch.ones(elements_per_gpu, dtype=dtype).cuda(local_rank)
            assert mat.numel() % world_size == 0, f"tensor with {mat.numel()} elements cannot be divided in {world_size} chunks"
            input = ((mat.mul_(float(global_rank))).view(-1))
            # Delete original mat to avoid OOM
            del mat
            torch.cuda.empty_cache()
            output = torch.zeros(elements_per_gpu, dtype=dtype).cuda(local_rank)
        except RuntimeError as e:
            # Re-raise anything that is not OOM; otherwise the code below would
            # fail on unbound tensors and hide the real error.
            if 'out of memory' not in str(e):
                raise
            if dist.get_rank() == 0:
                print(
                    'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
                )
            sync_all()
            return
        sync_all()

        if args.debug:
            # Take turns printing, one rank per barrier, so output is not interleaved.
            for i in range(world_size):
                if i == global_rank:
                    print(f"Before AllToAll Input List at rank {global_rank}: {input}")
                dist.barrier()

        timed_all_to_all(input, output, args)

        if args.debug:
            for i in range(world_size):
                if i == global_rank:
                    print(f"AllToAll Results at rank {global_rank}: {output}")
                dist.barrier()


if __name__ == "__main__":
    # Script entry point: parse the CLI, initialise the distributed backend,
    # then run the all_to_all benchmark on this process's local rank.
    args = benchmark_parser().parse_args()
    local_rank = args.local_rank
    init_processes(local_rank=local_rank, args=args)
    run_all_to_all(local_rank=local_rank, args=args)


================================================
FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/broadcast.py
================================================
import torch
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *

import time


def timed_broadcast(input, args):
    """Time ``dist.broadcast`` and print one row of benchmark results.

    Runs ``args.warmups`` untimed broadcasts followed by ``args.trials`` timed
    ones, averages the wall-clock duration over the trials, and prints the
    message size, a ``<numel>x<element_size>`` description, the duration and
    the throughput / bus-bandwidth figures produced by ``get_bw``.

    Args:
        input: Tensor handed to ``dist.broadcast`` with source argument ``0``.
        args: Parsed benchmark options. This function reads ``dist``
            (``'torch'`` or ``'deepspeed'``), ``warmups``, ``trials``,
            ``async_op`` and ``raw``; ``args`` is also forwarded to ``get_bw``
            and ``get_metric_strings``.
    """
    # Select the communication module at call time so only the requested
    # backend package has to be importable.
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    sync_all()
    # Warmups, establish connections, etc.
    for _ in range(args.warmups):
        dist.broadcast(input, 0, async_op=args.async_op)
    sync_all()

    # Time all trials as one batch. sync_all() runs before the clock is read
    # again (presumably so outstanding work is included -- sync_all is defined
    # in utils and not visible here).
    pre = time.perf_counter()
    for _ in range(args.trials):
        dist.broadcast(input, 0, async_op=args.async_op)
    sync_all()
    duration = time.perf_counter() - pre

    # maintain and clean performance data
    avg_duration = duration / args.trials
    size = input.element_size() * input.nelement()  # message size in bytes
    tput, busbw = get_bw('broadcast', size, avg_duration, args)
    tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
    desc = f'{input.nelement()}x{input.element_size()}'

    # Unless raw output was requested, replace the byte count with the value
    # returned by convert_size().
    if not args.raw:
        size = convert_size(size)

    print_rank_0(
        f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")


def run_broadcast(local_rank, args):
    """Run the broadcast benchmark as a size scan or as one large message.

    With ``args.scan`` set, a tensor of shape ``(world_size, 2**p)`` is
    allocated for every ``p`` in ``range(1, args.maxsize)`` and each one is
    passed to ``timed_broadcast``. Otherwise a single tensor with
    ``max_numel(...)`` elements is allocated and timed once.

    Args:
        local_rank: CUDA device index the benchmark tensors are moved to.
        args: Parsed benchmark options. This function reads ``dist``,
            ``scan``, ``maxsize``, ``dtype`` and ``mem_factor``; ``args`` is
            also forwarded to ``print_header``, ``max_numel`` and
            ``timed_broadcast``.

    Raises:
        RuntimeError: If tensor allocation fails for any reason other than
            running out of GPU memory. Out-of-memory is reported with a
            warning and ends the benchmark instead of raising.
    """
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    # Prepare benchmark header
    print_header(args, 'broadcast')

    world_size = dist.get_world_size()
    global_rank = dist.get_rank()
    dtype = getattr(torch, args.dtype)

    if args.scan:
        sizes = [2**p for p in range(1, args.maxsize)]

        sync_all()
        # loop over various tensor sizes
        for M in sizes:
            try:
                mat = torch.ones(world_size, M, dtype=dtype).cuda(local_rank)
                sync_all()
                # Scale in place by the rank and flatten to 1-D.
                tensor = mat.mul_(float(global_rank)).view(-1)
            except RuntimeError as e:
                # Only out-of-memory is an expected way to end the scan.
                # Anything else is a real failure and must not be swallowed,
                # otherwise the code below would run with a missing or stale
                # tensor.
                if 'out of memory' not in str(e):
                    raise
                if dist.get_rank() == 0:
                    print('WARNING: Ran out of GPU memory. Exiting comm op.')
                sync_all()
                break
            sync_all()
            timed_broadcast(tensor, args)
    else:
        # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
        # Don't need output tensor, so we double mem_factor
        elements_per_gpu = max_numel(comm_op='broadcast',
                                     dtype=dtype,
                                     mem_factor=args.mem_factor * 2,
                                     local_rank=local_rank,
                                     args=args)
        try:
            mat = torch.ones(elements_per_gpu, dtype=dtype).cuda(local_rank)
            tensor = mat.mul_(float(global_rank)).view(-1)
        except RuntimeError as e:
            # Same policy as the scan branch: warn and stop on OOM, propagate
            # every other RuntimeError.
            if 'out of memory' not in str(e):
                raise
            if dist.get_rank() == 0:
                print(
                    'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
                )
            sync_all()
            return
        sync_all()
        timed_broadcast(tensor, args)


if __name__ == "__main__":
    # Script entry point: parse the benchmark options, hand them to
    # init_processes(), then run the broadcast benchmark. The launcher-provided
    # local rank is read straight off the parsed options for both calls.
    args = benchmark_parser().parse_args()
    init_processes(local_rank=args.local_rank, args=args)
    run_broadcast(local_rank=args.local_rank, args=args)


================================================
FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/constants.py
================================================
# Default values for the communication benchmark options.
# NOTE(review): presumably used as argparse defaults by benchmark_parser() in
# utils.py, which is not visible from this file -- confirm each meaning there.
DEFAULT_WARMUPS = 5  # presumably the number of untimed warmup iterations
DEFAULT_TRIALS = 50  # presumably the number of timed iterations that get averaged
DEFAULT_TYPE = 'float'  # presumably a torch dtype attribute name -- TODO confirm
DEFAULT_BACKEND = 'nccl'  # presumably the communication backend name
DEFAULT_UNIT = 'Gbps'  # presumably the unit used when reporting bandwidth
DEFAULT_DIST = 'deepspeed'  # presumably selects deepspeed.comm rather than torch.distributed
DEFAULT_MAXSIZE = 24  # presumably the exclusive upper exponent for power-of-two scan sizes


================================================
FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/pt2pt.py
================================================
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *

import time


def timed_pt2pt(input, args):
    """Time point-to-point transfers from rank 0 to rank 1 and print one row.

    Rank 0 sends ``input`` to rank 1 and rank 1 receives into ``input``; all
    other ranks issue no transfer but still take part in the ``sync_all()``
    calls. ``args.warmups`` untimed transfers are followed by ``args.trials``
    timed ones, and the average duration is reported together with the
    throughput / bus-bandwidth figures produced by ``get_bw``.

    Args:
        input: Tensor used as the send buffer on rank 0 and the receive buffer
            on rank 1.
        args: Parsed benchmark options. This function reads ``dist``
            (``'torch'`` or ``'deepspeed'``), ``warmups``, ``trials``,
            ``async_op`` and ``raw``; ``args`` is also forwarded to ``get_bw``
            and ``get_metric_strings``.
    """
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    # Look the rank up once, outside the timed loop, so the measurement covers
    # only the communication calls themselves.
    rank = dist.get_rank()

    def transfer_once():
        # One send (rank 0) or the matching receive (rank 1). With
        # args.async_op the non-blocking variants are used and the returned
        # request is not waited on here.
        if rank == 0:
            if args.async_op:
                dist.isend(input, 1)
            else:
                dist.send(input, 1)
        elif rank == 1:
            if args.async_op:
                dist.irecv(input, src=0)
            else:
                dist.recv(input, src=0)

    sync_all()
    # Warmups, establish connections, etc.
    for _ in range(args.warmups):
        transfer_once()
    sync_all()

    # time the actual comm op trials times and average it
    pre = time.perf_counter()
    for _ in range(args.trials):
        transfer_once()

    sync_all()
    duration = time.perf_counter() - pre

    # maintain and clean performance data
    avg_duration = duration / args.trials
    size = input.element_size() * input.nelement()  # message size in bytes
    tput, busbw = get_bw('pt2pt', size, avg_duration, args)
    tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
    desc = f'{input.nelement()}x{input.element_size()}'

    # Unless raw output was requested, replace the byte count with the value
    # returned by convert_size().
    if not args.raw:
        size = convert_size(size)

    print_rank_0(
        f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")


def run_pt2pt(local_rank,

Download .txt

gitextract_3q2i3t76/

├── .gitignore
├── LICENSE
├── README.md
├── benchmark/
│   ├── batch_size_table.md
│   ├── flexgen/
│   │   └── bench_scan_175b.sh
│   ├── flexllmgen/
│   │   ├── README.md
│   │   ├── bench_175b_1x4.sh
│   │   ├── bench_175b_4x1.sh
│   │   ├── bench_30b_1x4.sh
│   │   ├── bench_30b_4x1.sh
│   │   ├── bench_6.7b_1x4.sh
│   │   ├── bench_6.7b_4x1.sh
│   │   ├── bench_dist_multi_node.sh
│   │   ├── bench_dist_single_node.sh
│   │   └── bench_suite.py
│   ├── hf_ds/
│   │   ├── README.md
│   │   ├── bench_all_1x4.sh
│   │   ├── bench_ds_175b_4x1.sh
│   │   ├── bench_ds_30b_1x4.sh
│   │   ├── bench_ds_30b_4x1.sh
│   │   ├── bench_ds_6.7b_1x4.sh
│   │   ├── bench_ds_6.7b_2x1.sh
│   │   ├── bench_ds_6.7b_4x1.sh
│   │   ├── bench_hf.py
│   │   ├── hf_opt.py
│   │   └── hostfile
│   ├── petals/
│   │   ├── README.md
│   │   └── run_opt_requests.py
│   └── third_party/
│       ├── DeepSpeed/
│       │   ├── .clang-format
│       │   ├── .github/
│       │   │   ├── ISSUE_TEMPLATE/
│       │   │   │   ├── compression_bug_report.md
│       │   │   │   ├── feature_request.md
│       │   │   │   ├── inference_bug_report.md
│       │   │   │   └── training_bug_report.md
│       │   │   └── workflows/
│       │   │       ├── amd.yml
│       │   │       ├── formatting.yml
│       │   │       ├── nv-accelerate-v100.yml
│       │   │       ├── nv-inference.yml
│       │   │       ├── nv-lightning-v100.yml
│       │   │       ├── nv-mii.yml
│       │   │       ├── nv-nightly.yml
│       │   │       ├── nv-torch-latest-v100.yml
│       │   │       ├── nv-torch-nightly-v100.yml
│       │   │       ├── nv-torch18-p40.yml
│       │   │       ├── nv-torch18-v100.yml
│       │   │       ├── nv-transformers-v100.yml
│       │   │       ├── pre-compile-ops.yml
│       │   │       └── python.yml
│       │   ├── .gitignore
│       │   ├── .pre-commit-config.yaml
│       │   ├── .pylintrc
│       │   ├── .readthedocs.yml
│       │   ├── .style.yapf
│       │   ├── CODEOWNERS
│       │   ├── CODE_OF_CONDUCT.md
│       │   ├── CONTRIBUTING.md
│       │   ├── LICENSE
│       │   ├── MANIFEST.in
│       │   ├── MANIFEST_win.in
│       │   ├── README.md
│       │   ├── SECURITY.md
│       │   ├── azure/
│       │   │   └── README.md
│       │   ├── benchmarks/
│       │   │   ├── __init__.py
│       │   │   ├── communication/
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── all_gather.py
│       │   │   │   ├── all_reduce.py
│       │   │   │   ├── all_to_all.py
│       │   │   │   ├── broadcast.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── pt2pt.py
│       │   │   │   ├── run_all.py
│       │   │   │   └── utils.py
│       │   │   └── inference/
│       │   │       ├── bert-bench.py
│       │   │       ├── collect_results.py
│       │   │       ├── gpt-bench.py
│       │   │       ├── requirements.txt
│       │   │       ├── run_model.sh
│       │   │       └── sweep.sh
│       │   ├── bin/
│       │   │   ├── ds
│       │   │   ├── ds_bench
│       │   │   ├── ds_elastic
│       │   │   ├── ds_report
│       │   │   └── ds_ssh
│       │   ├── build_win.bat
│       │   ├── csrc/
│       │   │   ├── adagrad/
│       │   │   │   └── cpu_adagrad.cpp
│       │   │   ├── adam/
│       │   │   │   ├── cpu_adam.cpp
│       │   │   │   ├── fused_adam_frontend.cpp
│       │   │   │   ├── multi_tensor_adam.cu
│       │   │   │   └── multi_tensor_apply.cuh
│       │   │   ├── aio/
│       │   │   │   ├── common/
│       │   │   │   │   ├── deepspeed_aio_common.cpp
│       │   │   │   │   ├── deepspeed_aio_common.h
│       │   │   │   │   ├── deepspeed_aio_types.cpp
│       │   │   │   │   ├── deepspeed_aio_types.h
│       │   │   │   │   ├── deepspeed_aio_utils.cpp
│       │   │   │   │   └── deepspeed_aio_utils.h
│       │   │   │   ├── py_lib/
│       │   │   │   │   ├── deepspeed_aio_thread.cpp
│       │   │   │   │   ├── deepspeed_aio_thread.h
│       │   │   │   │   ├── deepspeed_py_aio.cpp
│       │   │   │   │   ├── deepspeed_py_aio.h
│       │   │   │   │   ├── deepspeed_py_aio_handle.cpp
│       │   │   │   │   ├── deepspeed_py_aio_handle.h
│       │   │   │   │   ├── deepspeed_py_copy.cpp
│       │   │   │   │   ├── deepspeed_py_copy.h
│       │   │   │   │   └── py_ds_aio.cpp
│       │   │   │   └── py_test/
│       │   │   │       ├── aio_bench_generate_param.py
│       │   │   │       ├── aio_bench_perf_sweep.py
│       │   │   │       ├── ds_aio_basic.py
│       │   │   │       ├── ds_aio_handle.py
│       │   │   │       ├── parse_aio_stats.py
│       │   │   │       ├── perf_sweep_utils.py
│       │   │   │       ├── run_read_sweep.sh
│       │   │   │       ├── run_write_sweep.sh
│       │   │   │       ├── single_process_config.json
│       │   │   │       ├── test_ds_aio.py
│       │   │   │       ├── test_ds_aio_utils.py
│       │   │   │       └── validate_async_io.py
│       │   │   ├── common/
│       │   │   │   └── custom_cuda_kernel.cu
│       │   │   ├── includes/
│       │   │   │   ├── StopWatch.h
│       │   │   │   ├── Timer.h
│       │   │   │   ├── compat.h
│       │   │   │   ├── context.h
│       │   │   │   ├── conversion_utils.h
│       │   │   │   ├── cpu_adagrad.h
│       │   │   │   ├── cpu_adam.h
│       │   │   │   ├── cublas_wrappers.h
│       │   │   │   ├── custom_cuda_layers.h
│       │   │   │   ├── dequantization_utils.h
│       │   │   │   ├── dropout.h
│       │   │   │   ├── ds_kernel_utils.h
│       │   │   │   ├── ds_transformer_cuda.h
│       │   │   │   ├── feed_forward.h
│       │   │   │   ├── gelu.h
│       │   │   │   ├── gemm_test.h
│       │   │   │   ├── general_kernels.h
│       │   │   │   ├── memory_access_utils.h
│       │   │   │   ├── normalize_layer.h
│       │   │   │   ├── quantization.h
│       │   │   │   ├── quantization_utils.h
│       │   │   │   ├── quantizer.h
│       │   │   │   ├── reduction_utils.h
│       │   │   │   ├── simd.h
│       │   │   │   ├── softmax.h
│       │   │   │   ├── strided_batch_gemm.h
│       │   │   │   └── type_shim.h
│       │   │   ├── lamb/
│       │   │   │   ├── fused_lamb_cuda.cpp
│       │   │   │   └── fused_lamb_cuda_kernel.cu
│       │   │   ├── quantization/
│       │   │   │   ├── dequantize.cu
│       │   │   │   ├── fake_quantizer.cu
│       │   │   │   ├── pt_binding.cpp
│       │   │   │   └── quantize.cu
│       │   │   ├── sparse_attention/
│       │   │   │   └── utils.cpp
│       │   │   ├── spatial/
│       │   │   │   ├── csrc/
│       │   │   │   │   ├── opt_bias_add.cu
│       │   │   │   │   └── pt_binding.cpp
│       │   │   │   └── includes/
│       │   │   │       └── spatial_cuda_layers.h
│       │   │   ├── transformer/
│       │   │   │   ├── cublas_wrappers.cu
│       │   │   │   ├── dropout_kernels.cu
│       │   │   │   ├── ds_transformer_cuda.cpp
│       │   │   │   ├── gelu_kernels.cu
│       │   │   │   ├── general_kernels.cu
│       │   │   │   ├── inference/
│       │   │   │   │   ├── csrc/
│       │   │   │   │   │   ├── apply_rotary_pos_emb.cu
│       │   │   │   │   │   ├── dequantize.cu
│       │   │   │   │   │   ├── gelu.cu
│       │   │   │   │   │   ├── layer_norm.cu
│       │   │   │   │   │   ├── pt_binding.cpp
│       │   │   │   │   │   ├── relu.cu
│       │   │   │   │   │   ├── softmax.cu
│       │   │   │   │   │   └── transform.cu
│       │   │   │   │   └── includes/
│       │   │   │   │       ├── inference_context.h
│       │   │   │   │       ├── inference_cublas_wrappers.h
│       │   │   │   │       └── inference_cuda_layers.h
│       │   │   │   ├── normalize_kernels.cu
│       │   │   │   ├── softmax_kernels.cu
│       │   │   │   └── transform_kernels.cu
│       │   │   └── utils/
│       │   │       └── flatten_unflatten.cpp
│       │   ├── deepspeed/
│       │   │   ├── __init__.py
│       │   │   ├── accelerator/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── abstract_accelerator.py
│       │   │   │   ├── cuda_accelerator.py
│       │   │   │   └── real_accelerator.py
│       │   │   ├── autotuning/
│       │   │   │   ├── .gitignore
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── autotuner.py
│       │   │   │   ├── config.py
│       │   │   │   ├── config_templates/
│       │   │   │   │   ├── template_zero0.json
│       │   │   │   │   ├── template_zero1.json
│       │   │   │   │   ├── template_zero2.json
│       │   │   │   │   └── template_zero3.json
│       │   │   │   ├── constants.py
│       │   │   │   ├── scheduler.py
│       │   │   │   ├── tuner/
│       │   │   │   │   ├── README.md
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base_tuner.py
│       │   │   │   │   ├── cost_model.py
│       │   │   │   │   ├── index_based_tuner.py
│       │   │   │   │   ├── model_based_tuner.py
│       │   │   │   │   └── utils.py
│       │   │   │   └── utils.py
│       │   │   ├── checkpoint/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── deepspeed_checkpoint.py
│       │   │   │   ├── reshape_3d_utils.py
│       │   │   │   ├── reshape_meg_2d.py
│       │   │   │   ├── reshape_utils.py
│       │   │   │   ├── universal_checkpoint.py
│       │   │   │   ├── utils.py
│       │   │   │   └── zero_checkpoint.py
│       │   │   ├── comm/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── backend.py
│       │   │   │   ├── comm.py
│       │   │   │   ├── config.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── torch.py
│       │   │   │   └── utils.py
│       │   │   ├── compression/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── basic_layer.py
│       │   │   │   ├── compress.py
│       │   │   │   ├── config.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── helper.py
│       │   │   │   ├── scheduler.py
│       │   │   │   └── utils.py
│       │   │   ├── constants.py
│       │   │   ├── elasticity/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── config.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── elastic_agent.py
│       │   │   │   ├── elasticity.py
│       │   │   │   └── utils.py
│       │   │   ├── env_report.py
│       │   │   ├── git_version_info.py
│       │   │   ├── inference/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── config.py
│       │   │   │   └── engine.py
│       │   │   ├── launcher/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── launch.py
│       │   │   │   ├── multinode_runner.py
│       │   │   │   └── runner.py
│       │   │   ├── model_implementations/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── diffusers/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── unet.py
│       │   │   │   │   └── vae.py
│       │   │   │   └── transformers/
│       │   │   │       ├── __init__.py
│       │   │   │       ├── clip_encoder.py
│       │   │   │       └── ds_transformer.py
│       │   │   ├── module_inject/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── inject.py
│       │   │   │   ├── layers.py
│       │   │   │   ├── load_checkpoint.py
│       │   │   │   ├── module_quantize.py
│       │   │   │   ├── replace_module.py
│       │   │   │   └── replace_policy.py
│       │   │   ├── moe/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── experts.py
│       │   │   │   ├── layer.py
│       │   │   │   ├── mappings.py
│       │   │   │   ├── sharded_moe.py
│       │   │   │   └── utils.py
│       │   │   ├── monitor/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── config.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── csv_monitor.py
│       │   │   │   ├── monitor.py
│       │   │   │   ├── tensorboard.py
│       │   │   │   ├── utils.py
│       │   │   │   └── wandb.py
│       │   │   ├── nebula/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── config.py
│       │   │   │   └── constants.py
│       │   │   ├── ops/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── adagrad/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── cpu_adagrad.py
│       │   │   │   ├── adam/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── cpu_adam.py
│       │   │   │   │   ├── fused_adam.py
│       │   │   │   │   └── multi_tensor_apply.py
│       │   │   │   ├── aio/
│       │   │   │   │   └── __init__.py
│       │   │   │   ├── lamb/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── fused_lamb.py
│       │   │   │   ├── quantizer/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── quantizer.py
│       │   │   │   ├── sparse_attention/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── bert_sparse_self_attention.py
│       │   │   │   │   ├── matmul.py
│       │   │   │   │   ├── softmax.py
│       │   │   │   │   ├── sparse_attention_utils.py
│       │   │   │   │   ├── sparse_self_attention.py
│       │   │   │   │   ├── sparsity_config.py
│       │   │   │   │   └── trsrc/
│       │   │   │   │       ├── __init__.py
│       │   │   │   │       ├── matmul.tr
│       │   │   │   │       ├── softmax_bwd.tr
│       │   │   │   │       └── softmax_fwd.tr
│       │   │   │   └── transformer/
│       │   │   │       ├── __init__.py
│       │   │   │       ├── inference/
│       │   │   │       │   ├── __init__.py
│       │   │   │       │   ├── bias_add.py
│       │   │   │       │   ├── config.py
│       │   │   │       │   ├── diffusers_2d_transformer.py
│       │   │   │       │   ├── diffusers_attention.py
│       │   │   │       │   ├── diffusers_transformer_block.py
│       │   │   │       │   ├── ds_attention.py
│       │   │   │       │   ├── ds_mlp.py
│       │   │   │       │   ├── moe_inference.py
│       │   │   │       │   └── triton_ops.py
│       │   │   │       └── transformer.py
│       │   │   ├── pipe/
│       │   │   │   └── __init__.py
│       │   │   ├── profiling/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── config.py
│       │   │   │   ├── constants.py
│       │   │   │   └── flops_profiler/
│       │   │   │       ├── README.md
│       │   │   │       ├── __init__.py
│       │   │   │       └── profiler.py
│       │   │   ├── runtime/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── activation_checkpointing/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── checkpointing.py
│       │   │   │   │   └── config.py
│       │   │   │   ├── bf16_optimizer.py
│       │   │   │   ├── checkpoint_engine/
│       │   │   │   │   ├── README.md
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── checkpoint_engine.py
│       │   │   │   │   ├── nebula_checkpoint_engine.py
│       │   │   │   │   └── torch_checkpoint_engine.py
│       │   │   │   ├── comm/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── coalesced_collectives.py
│       │   │   │   │   ├── mpi.py
│       │   │   │   │   └── nccl.py
│       │   │   │   ├── compression/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── cupy.py
│       │   │   │   ├── config.py
│       │   │   │   ├── config_utils.py
│       │   │   │   ├── constants.py
│       │   │   │   ├── data_pipeline/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── curriculum_scheduler.py
│       │   │   │   ├── dataloader.py
│       │   │   │   ├── eigenvalue.py
│       │   │   │   ├── engine.py
│       │   │   │   ├── fp16/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── fused_optimizer.py
│       │   │   │   │   ├── loss_scaler.py
│       │   │   │   │   ├── onebit/
│       │   │   │   │   │   ├── __init__.py
│       │   │   │   │   │   ├── adam.py
│       │   │   │   │   │   ├── lamb.py
│       │   │   │   │   │   └── zoadam.py
│       │   │   │   │   └── unfused_optimizer.py
│       │   │   │   ├── lr_schedules.py
│       │   │   │   ├── pipe/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── engine.py
│       │   │   │   │   ├── module.py
│       │   │   │   │   ├── p2p.py
│       │   │   │   │   ├── schedule.py
│       │   │   │   │   └── topology.py
│       │   │   │   ├── progressive_layer_drop.py
│       │   │   │   ├── quantize.py
│       │   │   │   ├── sparse_tensor.py
│       │   │   │   ├── state_dict_factory.py
│       │   │   │   ├── swap_tensor/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── aio_config.py
│       │   │   │   │   ├── async_swapper.py
│       │   │   │   │   ├── constants.py
│       │   │   │   │   ├── optimizer_utils.py
│       │   │   │   │   ├── partitioned_optimizer_swapper.py
│       │   │   │   │   ├── partitioned_param_swapper.py
│       │   │   │   │   ├── pipelined_optimizer_swapper.py
│       │   │   │   │   └── utils.py
│       │   │   │   ├── utils.py
│       │   │   │   ├── weight_quantizer.py
│       │   │   │   └── zero/
│       │   │   │       ├── __init__.py
│       │   │   │       ├── config.py
│       │   │   │       ├── contiguous_memory_allocator.py
│       │   │   │       ├── linear.py
│       │   │   │       ├── offload_config.py
│       │   │   │       ├── parameter_offload.py
│       │   │   │       ├── partition_parameters.py
│       │   │   │       ├── partitioned_param_coordinator.py
│       │   │   │       ├── stage3.py
│       │   │   │       ├── stage_1_and_2.py
│       │   │   │       ├── test.py
│       │   │   │       ├── tiling.py
│       │   │   │       └── utils.py
│       │   │   └── utils/
│       │   │       ├── __init__.py
│       │   │       ├── comms_logging.py
│       │   │       ├── debug.py
│       │   │       ├── exceptions.py
│       │   │       ├── groups.py
│       │   │       ├── init_on_device.py
│       │   │       ├── logging.py
│       │   │       ├── mixed_precision_linkage.py
│       │   │       ├── nvtx.py
│       │   │       ├── tensor_fragment.py
│       │   │       ├── timer.py
│       │   │       ├── types.py
│       │   │       └── zero_to_fp32.py
│       │   ├── docker/
│       │   │   ├── Dockerfile
│       │   │   └── Dockerfile.rocm
│       │   ├── docs/
│       │   │   ├── 404.html
│       │   │   ├── CNAME
│       │   │   ├── Gemfile
│       │   │   ├── README.md
│       │   │   ├── _config.yml
│       │   │   ├── _data/
│       │   │   │   └── navigation.yml
│       │   │   ├── _includes/
│       │   │   │   ├── analytics.html
│       │   │   │   ├── archive-single.html
│       │   │   │   ├── author-profile-custom-links.html
│       │   │   │   ├── author-profile.html
│       │   │   │   ├── breadcrumbs.html
│       │   │   │   ├── browser-upgrade.html
│       │   │   │   ├── category-list.html
│       │   │   │   ├── comment.html
│       │   │   │   ├── comments.html
│       │   │   │   ├── documents-collection.html
│       │   │   │   ├── feature_row
│       │   │   │   ├── figure
│       │   │   │   ├── footer.html
│       │   │   │   ├── gallery
│       │   │   │   ├── group-by-array
│       │   │   │   ├── head.html
│       │   │   │   ├── masthead.html
│       │   │   │   ├── nav_list
│       │   │   │   ├── page__date.html
│       │   │   │   ├── page__hero.html
│       │   │   │   ├── page__hero_video.html
│       │   │   │   ├── page__meta.html
│       │   │   │   ├── page__taxonomy.html
│       │   │   │   ├── paginator.html
│       │   │   │   ├── post_pagination.html
│       │   │   │   ├── posts-category.html
│       │   │   │   ├── posts-tag.html
│       │   │   │   ├── scripts.html
│       │   │   │   ├── seo.html
│       │   │   │   ├── sidebar.html
│       │   │   │   ├── skip-links.html
│       │   │   │   ├── social-share.html
│       │   │   │   ├── tag-list.html
│       │   │   │   ├── toc
│       │   │   │   ├── toc.html
│       │   │   │   └── video
│       │   │   ├── _layouts/
│       │   │   │   └── single-full.html
│       │   │   ├── _pages/
│       │   │   │   ├── compression.md
│       │   │   │   ├── config-json.md
│       │   │   │   ├── inference.md
│       │   │   │   ├── posts-landing.md
│       │   │   │   ├── posts_list_landing.md
│       │   │   │   ├── training.md
│       │   │   │   └── tutorials-landing.md
│       │   │   ├── _posts/
│       │   │   │   ├── 2020-02-13-release.md
│       │   │   │   ├── 2020-02-13-turing-nlg.md
│       │   │   │   ├── 2020-03-17-reduce-scatter.md
│       │   │   │   ├── 2020-05-19-bert-record.md
│       │   │   │   ├── 2020-05-19-press-release.md
│       │   │   │   ├── 2020-05-19-zero-stage2.md
│       │   │   │   ├── 2020-05-28-fastest-bert-training.md
│       │   │   │   ├── 2020-07-24-deepspeed-webinar.md
│       │   │   │   ├── 2020-08-07-webinar-on-demand.md
│       │   │   │   ├── 2020-09-08-sparse-attention-news.md
│       │   │   │   ├── 2020-09-09-ZeRO-Offload.md
│       │   │   │   ├── 2020-09-09-onebit-adam-blog-post.md
│       │   │   │   ├── 2020-09-09-onebit-adam-news.md
│       │   │   │   ├── 2020-09-09-pipeline-parallelism.md
│       │   │   │   ├── 2020-09-09-sparse-attention.md
│       │   │   │   ├── 2020-10-28-progressive-layer-dropping-news.md
│       │   │   │   ├── 2021-03-08-zero3-offload.md
│       │   │   │   ├── 2021-05-05-MoQ.md
│       │   │   │   ├── 2021-05-05-inference-kernel-optimization.md
│       │   │   │   ├── 2021-05-14-inference-release.md
│       │   │   │   ├── 2021-08-18-deepspeed-moe.md
│       │   │   │   ├── 2021-11-15-autotuning.md
│       │   │   │   ├── 2021-12-09-deepspeed-moe-nlg.md
│       │   │   │   ├── 2022-01-19-moe-inference.md
│       │   │   │   ├── 2022-03-21-amd-support.md
│       │   │   │   ├── 2022-07-26-deepspeed-azure.md
│       │   │   │   ├── 2022-09-10-zero-inference.md
│       │   │   │   └── 2022-10-11-mii.md
│       │   │   ├── _sass/
│       │   │   │   ├── button-group.scss
│       │   │   │   ├── minimal-mistakes/
│       │   │   │   │   ├── _archive.scss
│       │   │   │   │   ├── _navigation.scss
│       │   │   │   │   ├── _page.scss
│       │   │   │   │   ├── _sidebar.scss
│       │   │   │   │   ├── _variables.scss
│       │   │   │   │   └── skins/
│       │   │   │   │       └── _air.scss
│       │   │   │   └── minimal-mistakes.scss
│       │   │   ├── _tutorials/
│       │   │   │   ├── MoQ-tutorial.md
│       │   │   │   ├── advanced-install.md
│       │   │   │   ├── autotuning.md
│       │   │   │   ├── azure.md
│       │   │   │   ├── bert-finetuning.md
│       │   │   │   ├── bert-pretraining.md
│       │   │   │   ├── cifar-10.md
│       │   │   │   ├── comms-logging.md
│       │   │   │   ├── curriculum-learning.md
│       │   │   │   ├── flops-profiler.md
│       │   │   │   ├── gan.md
│       │   │   │   ├── getting-started.md
│       │   │   │   ├── inference-tutorial.md
│       │   │   │   ├── large-models-w-deepspeed.md
│       │   │   │   ├── lrrt.md
│       │   │   │   ├── megatron.md
│       │   │   │   ├── mixture-of-experts-inference.md
│       │   │   │   ├── mixture-of-experts-nlg.md
│       │   │   │   ├── mixture-of-experts.md
│       │   │   │   ├── model-compression.md
│       │   │   │   ├── monitor.md
│       │   │   │   ├── one-cycle.md
│       │   │   │   ├── onebit-adam.md
│       │   │   │   ├── onebit-lamb.md
│       │   │   │   ├── pipeline.md
│       │   │   │   ├── progressive_layer_dropping.md
│       │   │   │   ├── pytorch-profiler.md
│       │   │   │   ├── sparse-attention.md
│       │   │   │   ├── transformer_kernel.md
│       │   │   │   ├── zero-offload.md
│       │   │   │   ├── zero-one-adam.md
│       │   │   │   └── zero.md
│       │   │   ├── assets/
│       │   │   │   └── css/
│       │   │   │       └── main.scss
│       │   │   ├── code-docs/
│       │   │   │   ├── Makefile
│       │   │   │   ├── build-api-docs.sh
│       │   │   │   └── source/
│       │   │   │       ├── activation-checkpointing.rst
│       │   │   │       ├── autotuning.rst
│       │   │   │       ├── conf.py
│       │   │   │       ├── flops-profiler.rst
│       │   │   │       ├── index.rst
│       │   │   │       ├── inference-engine.rst
│       │   │   │       ├── inference-init.rst
│       │   │   │       ├── initialize.rst
│       │   │   │       ├── kernel.rst
│       │   │   │       ├── memory.rst
│       │   │   │       ├── model-checkpointing.rst
│       │   │   │       ├── moe.rst
│       │   │   │       ├── optimizers.rst
│       │   │   │       ├── pipeline.rst
│       │   │   │       ├── schedulers.rst
│       │   │   │       ├── training.rst
│       │   │   │       └── zero3.rst
│       │   │   ├── contributing.md
│       │   │   └── index.md
│       │   ├── examples/
│       │   │   └── README.md
│       │   ├── install.sh
│       │   ├── op_builder/
│       │   │   ├── __init__.py
│       │   │   ├── all_ops.py
│       │   │   ├── async_io.py
│       │   │   ├── builder.py
│       │   │   ├── builder_names.py
│       │   │   ├── cpu_adagrad.py
│       │   │   ├── cpu_adam.py
│       │   │   ├── fused_adam.py
│       │   │   ├── fused_lamb.py
│       │   │   ├── quantizer.py
│       │   │   ├── sparse_attn.py
│       │   │   ├── spatial_inference.py
│       │   │   ├── stochastic_transformer.py
│       │   │   ├── transformer.py
│       │   │   ├── transformer_inference.py
│       │   │   └── utils.py
│       │   ├── release/
│       │   │   ├── bump_patch_version.py
│       │   │   └── release.sh
│       │   ├── requirements/
│       │   │   ├── requirements-1bit-mpi.txt
│       │   │   ├── requirements-autotuning-ml.txt
│       │   │   ├── requirements-autotuning.txt
│       │   │   ├── requirements-dev.txt
│       │   │   ├── requirements-inf.txt
│       │   │   ├── requirements-readthedocs.txt
│       │   │   ├── requirements-sd.txt
│       │   │   ├── requirements-sparse_attn.txt
│       │   │   └── requirements.txt
│       │   ├── scripts/
│       │   │   └── check-torchdist.py
│       │   ├── setup.cfg
│       │   ├── setup.py
│       │   ├── tests/
│       │   │   ├── benchmarks/
│       │   │   │   ├── flatten_bench.py
│       │   │   │   └── unflatten_bench.py
│       │   │   ├── conftest.py
│       │   │   ├── lightning/
│       │   │   │   └── test_simple.py
│       │   │   ├── model/
│       │   │   │   ├── BingBertSquad/
│       │   │   │   │   ├── BingBertSquad_run_func_test.py
│       │   │   │   │   ├── BingBertSquad_test_common.py
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── deepspeed_bsz24_fp16_config.json
│       │   │   │   │   ├── deepspeed_bsz24_fp16_eigenvalue_quantize_config.json
│       │   │   │   │   ├── deepspeed_bsz24_fp16_zero2_config.json
│       │   │   │   │   ├── deepspeed_bsz24_fp32_config.json
│       │   │   │   │   ├── run_BingBertSquad.sh
│       │   │   │   │   ├── run_BingBertSquad_sanity.sh
│       │   │   │   │   ├── run_tests.sh
│       │   │   │   │   └── test_e2e_squad.py
│       │   │   │   ├── Megatron_GPT2/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── ds_config_func_bs4_zero1.json
│       │   │   │   │   ├── ds_config_func_bs4_zero2.json
│       │   │   │   │   ├── ds_config_func_bs4_zero2_offload.json
│       │   │   │   │   ├── ds_config_func_bs8_no_zero.json
│       │   │   │   │   ├── ds_config_func_bs8_zero0_gas3.json
│       │   │   │   │   ├── ds_config_func_bs8_zero1.json
│       │   │   │   │   ├── ds_config_func_bs8_zero2.json
│       │   │   │   │   ├── ds_config_func_bs8_zero2_gas3.json
│       │   │   │   │   ├── ds_config_func_bs8_zero2_offload.json
│       │   │   │   │   ├── ds_config_func_scheduler.json
│       │   │   │   │   ├── ds_config_perf_bs16.json
│       │   │   │   │   ├── ds_config_perf_bs32.json
│       │   │   │   │   ├── ds_config_perf_bs8.json
│       │   │   │   │   ├── ds_gpt2_test.sh
│       │   │   │   │   ├── run_checkpoint_test.py
│       │   │   │   │   ├── run_func_test.py
│       │   │   │   │   ├── run_perf_baseline.py
│       │   │   │   │   ├── run_perf_test.py
│       │   │   │   │   └── test_common.py
│       │   │   │   └── run_sanity_check.py
│       │   │   ├── onebit/
│       │   │   │   ├── test_mpi_backend.py
│       │   │   │   ├── test_mpi_perf.py
│       │   │   │   ├── test_nccl_backend.py
│       │   │   │   └── test_nccl_perf.py
│       │   │   ├── perf/
│       │   │   │   ├── adam_test.py
│       │   │   │   └── adam_test1.py
│       │   │   ├── pytest.ini
│       │   │   ├── small_model_debugging/
│       │   │   │   ├── stage3_test.py
│       │   │   │   ├── test.py
│       │   │   │   └── test_model.py
│       │   │   └── unit/
│       │   │       ├── __init__.py
│       │   │       ├── alexnet_model.py
│       │   │       ├── autotuning/
│       │   │       │   └── test_autotuning.py
│       │   │       ├── checkpoint/
│       │   │       │   ├── common.py
│       │   │       │   ├── test_latest_checkpoint.py
│       │   │       │   ├── test_lr_scheduler.py
│       │   │       │   ├── test_moe_checkpoint.py
│       │   │       │   ├── test_other_optimizer.py
│       │   │       │   ├── test_pipeline.py
│       │   │       │   ├── test_reshape_checkpoint.py
│       │   │       │   ├── test_sparse.py
│       │   │       │   ├── test_tag_validation.py
│       │   │       │   └── test_zero_optimizer.py
│       │   │       ├── comm/
│       │   │       │   └── test_dist.py
│       │   │       ├── common.py
│       │   │       ├── compression/
│       │   │       │   └── test_compression.py
│       │   │       ├── ds_batch_config.json
│       │   │       ├── elasticity/
│       │   │       │   └── test_elastic.py
│       │   │       ├── gpt2-merges.txt
│       │   │       ├── gpt2-vocab.json
│       │   │       ├── inference/
│       │   │       │   ├── test_checkpoint_sharding.py
│       │   │       │   ├── test_inference.py
│       │   │       │   ├── test_inference_config.py
│       │   │       │   └── test_model_profiling.py
│       │   │       ├── launcher/
│       │   │       │   ├── test_ds_arguments.py
│       │   │       │   ├── test_multinode_runner.py
│       │   │       │   └── test_run.py
│       │   │       ├── megatron_model.py
│       │   │       ├── model_parallelism/
│       │   │       │   ├── test_configurable_parallel_mp.py
│       │   │       │   └── test_configurable_parallel_pp.py
│       │   │       ├── modeling.py
│       │   │       ├── modelingpreln.py
│       │   │       ├── moe/
│       │   │       │   ├── test_moe.py
│       │   │       │   └── test_moe_tp.py
│       │   │       ├── monitor/
│       │   │       │   └── test_monitor.py
│       │   │       ├── multi_output_model.py
│       │   │       ├── ops/
│       │   │       │   ├── adagrad/
│       │   │       │   │   └── test_cpu_adagrad.py
│       │   │       │   ├── adam/
│       │   │       │   │   ├── test_adamw.py
│       │   │       │   │   └── test_cpu_adam.py
│       │   │       │   ├── aio/
│       │   │       │   │   └── test_aio.py
│       │   │       │   ├── cuda/
│       │   │       │   │   ├── test_cuda_backward.py
│       │   │       │   │   └── test_cuda_forward.py
│       │   │       │   ├── quantizer/
│       │   │       │   │   ├── test_dequantize.py
│       │   │       │   │   ├── test_fake_quantization.py
│       │   │       │   │   └── test_quantize.py
│       │   │       │   ├── sparse_attention/
│       │   │       │   │   └── test_sparse_attention.py
│       │   │       │   ├── spatial/
│       │   │       │   │   └── test_nhwc_bias_add.py
│       │   │       │   └── transformer/
│       │   │       │       └── inference/
│       │   │       │           ├── test_bias_add.py
│       │   │       │           ├── test_bias_geglu.py
│       │   │       │           ├── test_bias_gelu.py
│       │   │       │           ├── test_bias_relu.py
│       │   │       │           ├── test_layer_norm.py
│       │   │       │           ├── test_moe_res_matmult.py
│       │   │       │           └── test_residual_add.py
│       │   │       ├── pipe/
│       │   │       │   └── test_pipe_module.py
│       │   │       ├── profiling/
│       │   │       │   └── flops_profiler/
│       │   │       │       └── test_flops_profiler.py
│       │   │       ├── runtime/
│       │   │       │   ├── activation_checkpointing/
│       │   │       │   │   └── test_activation_checkpointing.py
│       │   │       │   ├── comm/
│       │   │       │   │   └── test_coalesced_collectives.py
│       │   │       │   ├── half_precision/
│       │   │       │   │   ├── onebit/
│       │   │       │   │   │   └── test_onebit.py
│       │   │       │   │   ├── test_bf16.py
│       │   │       │   │   ├── test_dynamic_loss_scale.py
│       │   │       │   │   └── test_fp16.py
│       │   │       │   ├── pipe/
│       │   │       │   │   ├── test_pipe.py
│       │   │       │   │   ├── test_pipe_schedule.py
│       │   │       │   │   └── test_topology.py
│       │   │       │   ├── sparse_tensor/
│       │   │       │   │   ├── test_averaging_sparse_gradients.py
│       │   │       │   │   ├── test_csr.py
│       │   │       │   │   └── test_sparse_grads.py
│       │   │       │   ├── test_autocast.py
│       │   │       │   ├── test_curriculum_learning.py
│       │   │       │   ├── test_data.py
│       │   │       │   ├── test_ds_config_dict.py
│       │   │       │   ├── test_ds_config_model.py
│       │   │       │   ├── test_ds_initialize.py
│       │   │       │   ├── test_lr_schedulers.py
│       │   │       │   ├── test_multi_output_model.py
│       │   │       │   ├── test_pld.py
│       │   │       │   ├── test_runtime_utils.py
│       │   │       │   ├── utils/
│       │   │       │   │   └── test_partition.py
│       │   │       │   └── zero/
│       │   │       │       ├── test_ignore_unused_parameters.py
│       │   │       │       ├── test_zero.py
│       │   │       │       ├── test_zero_config.py
│       │   │       │       ├── test_zero_context.py
│       │   │       │       └── test_zero_tiled.py
│       │   │       ├── simple_model.py
│       │   │       ├── util.py
│       │   │       └── utils/
│       │   │           ├── test_get_optim_files.py
│       │   │           ├── test_groups.py
│       │   │           └── test_init_on_device.py
│       │   └── version.txt
│       ├── README.md
│       ├── pagecache-mangagement/
│       │   ├── .svn/
│       │   │   ├── all-wcprops
│       │   │   └── entries
│       │   ├── README.md
│       │   ├── branches/
│       │   │   └── .svn/
│       │   │       ├── all-wcprops
│       │   │       └── entries
│       │   ├── tags/
│       │   │   └── .svn/
│       │   │       ├── all-wcprops
│       │   │       └── entries
│       │   └── trunk/
│       │       ├── .svn/
│       │       │   ├── all-wcprops
│       │       │   ├── entries
│       │       │   ├── prop-base/
│       │       │   │   ├── benchmar_plain.svn-base
│       │       │   │   ├── benchmar_prepare.svn-base
│       │       │   │   ├── benchmar_qemu.svn-base
│       │       │   │   ├── benchmar_qemu2.svn-base
│       │       │   │   ├── benchmar_squashfs.svn-base
│       │       │   │   ├── benchmar_test.sh.svn-base
│       │       │   │   ├── fadv.sh.svn-base
│       │       │   │   ├── fadv_command.sh.svn-base
│       │       │   │   ├── pagecache-management-fadv.sh.svn-base
│       │       │   │   ├── pagecache-management-ignore-reads.sh.svn-base
│       │       │   │   ├── pagecache-management-lazy200.sh.svn-base
│       │       │   │   ├── pagecache-management-lazy200ir.sh.svn-base
│       │       │   │   ├── pagecache-management-null.sh.svn-base
│       │       │   │   └── pagecache-management.sh.svn-base
│       │       │   └── text-base/
│       │       │       ├── Makefile.svn-base
│       │       │       ├── benchmar_plain.svn-base
│       │       │       ├── benchmar_prepare.svn-base
│       │       │       ├── benchmar_qemu.svn-base
│       │       │       ├── benchmar_qemu2.svn-base
│       │       │       ├── benchmar_squashfs.svn-base
│       │       │       ├── benchmar_test.sh.svn-base
│       │       │       ├── fadv.c.svn-base
│       │       │       ├── fadv.sh.svn-base
│       │       │       ├── fadv_command.sh.svn-base
│       │       │       ├── pagecache-management-fadv.sh.svn-base
│       │       │       ├── pagecache-management-ignore-reads.sh.svn-base
│       │       │       ├── pagecache-management-lazy200.sh.svn-base
│       │       │       ├── pagecache-management-lazy200ir.sh.svn-base
│       │       │       ├── pagecache-management-null.sh.svn-base
│       │       │       ├── pagecache-management.c.svn-base
│       │       │       ├── pagecache-management.sh.svn-base
│       │       │       ├── pagecache-management.txt.svn-base
│       │       │       ├── sfr.c.svn-base
│       │       │       ├── sync_file_range.h.svn-base
│       │       │       └── test.c.svn-base
│       │       ├── benchmar_plain
│       │       ├── benchmar_prepare
│       │       ├── benchmar_qemu
│       │       ├── benchmar_qemu2
│       │       ├── benchmar_squashfs
│       │       ├── benchmar_test.sh
│       │       ├── fadv.c
│       │       ├── fadv.sh
│       │       ├── fadv_command.sh
│       │       ├── pagecache-management-fadv.sh
│       │       ├── pagecache-management-ignore-reads.sh
│       │       ├── pagecache-management-lazy200.sh
│       │       ├── pagecache-management-lazy200ir.sh
│       │       ├── pagecache-management-null.sh
│       │       ├── pagecache-management.c
│       │       ├── pagecache-management.sh
│       │       ├── pagecache-management.txt
│       │       ├── sfr.c
│       │       ├── sync_file_range.h
│       │       └── test.c
│       └── transformers/
│           ├── .circleci/
│           │   ├── TROUBLESHOOT.md
│           │   ├── config.yml
│           │   └── create_circleci_config.py
│           ├── .coveragerc
│           ├── .gitattributes
│           ├── .github/
│           │   ├── ISSUE_TEMPLATE/
│           │   │   ├── bug-report.yml
│           │   │   ├── config.yml
│           │   │   ├── feature-request.yml
│           │   │   ├── migration.yml
│           │   │   └── new-model-addition.yml
│           │   ├── PULL_REQUEST_TEMPLATE.md
│           │   ├── conda/
│           │   │   ├── build.sh
│           │   │   └── meta.yaml
│           │   └── workflows/
│           │       ├── TROUBLESHOOT.md
│           │       ├── add-model-like.yml
│           │       ├── build-docker-images.yml
│           │       ├── build-past-ci-docker-images.yml
│           │       ├── build_documentation.yml
│           │       ├── build_pr_documentation.yml
│           │       ├── check_runner_status.yml
│           │       ├── delete_doc_comment.yml
│           │       ├── doctests.yml
│           │       ├── model-templates.yml
│           │       ├── release-conda.yml
│           │       ├── self-nightly-scheduled.yml
│           │       ├── self-past-caller.yml
│           │       ├── self-past.yml
│           │       ├── self-push-caller.yml
│           │       ├── self-push.yml
│           │       ├── self-scheduled.yml
│           │       ├── stale.yml
│           │       └── update_metdata.yml
│           ├── .gitignore
│           ├── CITATION.cff
│           ├── CODE_OF_CONDUCT.md
│           ├── CONTRIBUTING.md
│           ├── ISSUES.md
│           ├── LICENSE
│           ├── MANIFEST.in
│           ├── Makefile
│           ├── README.md
│           ├── README_es.md
│           ├── README_ko.md
│           ├── README_zh-hans.md
│           ├── README_zh-hant.md
│           ├── conftest.py
│           ├── docker/
│           │   ├── transformers-all-latest-gpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-cpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-doc-builder/
│           │   │   └── Dockerfile
│           │   ├── transformers-gpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-past-gpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-pytorch-cpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-pytorch-deepspeed-latest-gpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-pytorch-deepspeed-nightly-gpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-pytorch-gpu/
│           │   │   └── Dockerfile
│           │   ├── transformers-pytorch-tpu/
│           │   │   ├── Dockerfile
│           │   │   ├── bert-base-cased.jsonnet
│           │   │   ├── dataset.yaml
│           │   │   └── docker-entrypoint.sh
│           │   ├── transformers-tensorflow-cpu/
│           │   │   └── Dockerfile
│           │   └── transformers-tensorflow-gpu/
│           │       └── Dockerfile
│           ├── docs/
│           │   ├── README.md
│           │   ├── TRANSLATING.md
│           │   └── source/
│           │       ├── _config.py
│           │       ├── de/
│           │       │   ├── _config.py
│           │       │   ├── _toctree.yml
│           │       │   ├── accelerate.mdx
│           │       │   ├── autoclass_tutorial.mdx
│           │       │   ├── index.mdx
│           │       │   ├── installation.mdx
│           │       │   ├── model_sharing.mdx
│           │       │   ├── pipeline_tutorial.mdx
│           │       │   ├── preprocessing.mdx
│           │       │   ├── quicktour.mdx
│           │       │   └── training.mdx
│           │       ├── en/
│           │       │   ├── _config.py
│           │       │   ├── _toctree.yml
│           │       │   ├── accelerate.mdx
│           │       │   ├── add_new_model.mdx
│           │       │   ├── add_new_pipeline.mdx
│           │       │   ├── add_tensorflow_model.mdx
│           │       │   ├── autoclass_tutorial.mdx
│           │       │   ├── benchmarks.mdx
│           │       │   ├── bertology.mdx
│           │       │   ├── big_models.mdx
│           │       │   ├── community.mdx
│           │       │   ├── converting_tensorflow_models.mdx
│           │       │   ├── create_a_model.mdx
│           │       │   ├── custom_models.mdx
│           │       │   ├── debugging.mdx
│           │       │   ├── fast_tokenizers.mdx
│           │       │   ├── glossary.mdx
│           │       │   ├── hpo_train.mdx
│           │       │   ├── index.mdx
│           │       │   ├── installation.mdx
│           │       │   ├── internal/
│           │       │   │   ├── file_utils.mdx
│           │       │   │   ├── generation_utils.mdx
│           │       │   │   ├── image_processing_utils.mdx
│           │       │   │   ├── modeling_utils.mdx
│           │       │   │   ├── pipelines_utils.mdx
│           │       │   │   ├── tokenization_utils.mdx
│           │       │   │   └── trainer_utils.mdx
│           │       │   ├── main_classes/
│           │       │   │   ├── callback.mdx
│           │       │   │   ├── configuration.mdx
│           │       │   │   ├── data_collator.mdx
│           │       │   │   ├── deepspeed.mdx
│           │       │   │   ├── feature_extractor.mdx
│           │       │   │   ├── keras_callbacks.mdx
│           │       │   │   ├── logging.mdx
│           │       │   │   ├── model.mdx
│           │       │   │   ├── onnx.mdx
│           │       │   │   ├── optimizer_schedules.mdx
│           │       │   │   ├── output.mdx
│           │       │   │   ├── pipelines.mdx
│           │       │   │   ├── processors.mdx
│           │       │   │   ├── text_generation.mdx
│           │       │   │   ├── tokenizer.mdx
│           │       │   │   └── trainer.mdx
│           │       │   ├── migration.mdx
│           │       │   ├── model_doc/
│           │       │   │   ├── albert.mdx
│           │       │   │   ├── auto.mdx
│           │       │   │   ├── bart.mdx
│           │       │   │   ├── barthez.mdx
│           │       │   │   ├── bartpho.mdx
│           │       │   │   ├── beit.mdx
│           │       │   │   ├── bert-generation.mdx
│           │       │   │   ├── bert-japanese.mdx
│           │       │   │   ├── bert.mdx
│           │       │   │   ├── bertweet.mdx
│           │       │   │   ├── big_bird.mdx
│           │       │   │   ├── bigbird_pegasus.mdx
│           │       │   │   ├── blenderbot-small.mdx
│           │       │   │   ├── blenderbot.mdx
│           │       │   │   ├── bloom.mdx
│           │       │   │   ├── bort.mdx
│           │       │   │   ├── byt5.mdx
│           │       │   │   ├── camembert.mdx
│           │       │   │   ├── canine.mdx
│           │       │   │   ├── clip.mdx
│           │       │   │   ├── codegen.mdx
│           │       │   │   ├── conditional_detr.mdx
│           │       │   │   ├── convbert.mdx
│           │       │   │   ├── convnext.mdx
│           │       │   │   ├── cpm.mdx
│           │       │   │   ├── ctrl.mdx
│           │       │   │   ├── cvt.mdx
│           │       │   │   ├── data2vec.mdx
│           │       │   │   ├── deberta-v2.mdx
│           │       │   │   ├── deberta.mdx
│           │       │   │   ├── decision_transformer.mdx
│           │       │   │   ├── deformable_detr.mdx
│           │       │   │   ├── deit.mdx
│           │       │   │   ├── detr.mdx
│           │       │   │   ├── dialogpt.mdx
│           │       │   │   ├── distilbert.mdx
│           │       │   │   ├── dit.mdx
│           │       │   │   ├── donut.mdx
│           │       │   │   ├── dpr.mdx
│           │       │   │   ├── dpt.mdx
│           │       │   │   ├── electra.mdx
│           │       │   │   ├── encoder-decoder.mdx
│           │       │   │   ├── ernie.mdx
│           │       │   │   ├── esm.mdx
│           │       │   │   ├── flan-t5.mdx
│           │       │   │   ├── flaubert.mdx
│           │       │   │   ├── flava.mdx
│           │       │   │   ├── fnet.mdx
│           │       │   │   ├── fsmt.mdx
│           │       │   │   ├── funnel.mdx
│           │       │   │   ├── glpn.mdx
│           │       │   │   ├── gpt2.mdx
│           │       │   │   ├── gpt_neo.mdx
│           │       │   │   ├── gpt_neox.mdx
│           │       │   │   ├── gpt_neox_japanese.mdx
│           │       │   │   ├── gptj.mdx
│           │       │   │   ├── groupvit.mdx
│           │       │   │   ├── herbert.mdx
│           │       │   │   ├── hubert.mdx
│           │       │   │   ├── ibert.mdx
│           │       │   │   ├── imagegpt.mdx
│           │       │   │   ├── layoutlm.mdx
│           │       │   │   ├── layoutlmv2.mdx
│           │       │   │   ├── layoutlmv3.mdx
│           │       │   │   ├── layoutxlm.mdx
│           │       │   │   ├── led.mdx
│           │       │   │   ├── levit.mdx
│           │       │   │   ├── lilt.mdx
│           │       │   │   ├── longformer.mdx
│           │       │   │   ├── longt5.mdx
│           │       │   │   ├── luke.mdx
│           │       │   │   ├── lxmert.mdx
│           │       │   │   ├── m2m_100.mdx
│           │       │   │   ├── marian.mdx
│           │       │   │   ├── markuplm.mdx
│           │       │   │   ├── maskformer.mdx
│           │       │   │   ├── mbart.mdx
│           │       │   │   ├── mctct.mdx
│           │       │   │   ├── megatron-bert.mdx
│           │       │   │   ├── megatron_gpt2.mdx
│           │       │   │   ├── mluke.mdx
│           │       │   │   ├── mobilebert.mdx
│           │       │   │   ├── mobilevit.mdx
│           │       │   │   ├── mpnet.mdx
│           │       │   │   ├── mt5.mdx
│           │       │   │   ├── mvp.mdx
│           │       │   │   ├── nezha.mdx
│           │       │   │   ├── nllb.mdx
│           │       │   │   ├── nystromformer.mdx
│           │       │   │   ├── openai-gpt.mdx
│           │       │   │   ├── opt.mdx
│           │       │   │   ├── owlvit.mdx
│           │       │   │   ├── pegasus.mdx
│           │       │   │   ├── pegasus_x.mdx
│           │       │   │   ├── perceiver.mdx
│           │       │   │   ├── phobert.mdx
│           │       │   │   ├── plbart.mdx
│           │       │   │   ├── poolformer.mdx
│           │       │   │   ├── prophetnet.mdx
│           │       │   │   ├── qdqbert.mdx
│           │       │   │   ├── rag.mdx
│           │       │   │   ├── realm.mdx
│           │       │   │   ├── reformer.mdx
│           │       │   │   ├── regnet.mdx
│           │       │   │   ├── rembert.mdx
│           │       │   │   ├── resnet.mdx
│           │       │   │   ├── retribert.mdx
│           │       │   │   ├── roberta.mdx
│           │       │   │   ├── roformer.mdx
│           │       │   │   ├── segformer.mdx
│           │       │   │   ├── sew-d.mdx
│           │       │   │   ├── sew.mdx
│           │       │   │   ├── speech-encoder-decoder.mdx
│           │       │   │   ├── speech_to_text.mdx
│           │       │   │   ├── speech_to_text_2.mdx
│           │       │   │   ├── splinter.mdx
│           │       │   │   ├── squeezebert.mdx
│           │       │   │   ├── swin.mdx
│           │       │   │   ├── swinv2.mdx
│           │       │   │   ├── t5.mdx
│           │       │   │   ├── t5v1.1.mdx
│           │       │   │   ├── table-transformer.mdx
│           │       │   │   ├── tapas.mdx
│           │       │   │   ├── tapex.mdx
│           │       │   │   ├── time_series_transformer.mdx
│           │       │   │   ├── trajectory_transformer.mdx
│           │       │   │   ├── transfo-xl.mdx
│           │       │   │   ├── trocr.mdx
│           │       │   │   ├── ul2.mdx
│           │       │   │   ├── unispeech-sat.mdx
│           │       │   │   ├── unispeech.mdx
│           │       │   │   ├── van.mdx
│           │       │   │   ├── videomae.mdx
│           │       │   │   ├── vilt.mdx
│           │       │   │   ├── vision-encoder-decoder.mdx
│           │       │   │   ├── vision-text-dual-encoder.mdx
│           │       │   │   ├── visual_bert.mdx
│           │       │   │   ├── vit.mdx
│           │       │   │   ├── vit_mae.mdx
│           │       │   │   ├── vit_msn.mdx
│           │       │   │   ├── wav2vec2-conformer.mdx
│           │       │   │   ├── wav2vec2.mdx
│           │       │   │   ├── wav2vec2_phoneme.mdx
│           │       │   │   ├── wavlm.mdx
│           │       │   │   ├── whisper.mdx
│           │       │   │   ├── xclip.mdx
│           │       │   │   ├── xglm.mdx
│           │       │   │   ├── xlm-prophetnet.mdx
│           │       │   │   ├── xlm-roberta-xl.mdx
│           │       │   │   ├── xlm-roberta.mdx
│           │       │   │   ├── xlm.mdx
│           │       │   │   ├── xlnet.mdx
│           │       │   │   ├── xls_r.mdx
│           │       │   │   ├── xlsr_wav2vec2.mdx
│           │       │   │   ├── yolos.mdx
│           │       │   │   └── yoso.mdx
│           │       │   ├── model_sharing.mdx
│           │       │   ├── model_summary.mdx
│           │       │   ├── multilingual.mdx
│           │       │   ├── pad_truncation.mdx
│           │       │   ├── perf_hardware.mdx
│           │       │   ├── perf_infer_cpu.mdx
│           │       │   ├── perf_infer_gpu_many.mdx
│           │       │   ├── perf_infer_gpu_one.mdx
│           │       │   ├── perf_infer_special.mdx
│           │       │   ├── perf_train_cpu.mdx
│           │       │   ├── perf_train_cpu_many.mdx
│           │       │   ├── perf_train_gpu_many.mdx
│           │       │   ├── perf_train_gpu_one.mdx
│           │       │   ├── perf_train_special.mdx
│           │       │   ├── perf_train_tpu.mdx
│           │       │   ├── performance.mdx
│           │       │   ├── perplexity.mdx
│           │       │   ├── philosophy.mdx
│           │       │   ├── pipeline_tutorial.mdx
│           │       │   ├── pr_checks.mdx
│           │       │   ├── preprocessing.mdx
│           │       │   ├── quicktour.mdx
│           │       │   ├── run_scripts.mdx
│           │       │   ├── sagemaker.mdx
│           │       │   ├── serialization.mdx
│           │       │   ├── task_summary.mdx
│           │       │   ├── tasks/
│           │       │   │   ├── asr.mdx
│           │       │   │   ├── audio_classification.mdx
│           │       │   │   ├── image_classification.mdx
│           │       │   │   ├── language_modeling.mdx
│           │       │   │   ├── multiple_choice.mdx
│           │       │   │   ├── question_answering.mdx
│           │       │   │   ├── semantic_segmentation.mdx
│           │       │   │   ├── sequence_classification.mdx
│           │       │   │   ├── summarization.mdx
│           │       │   │   ├── token_classification.mdx
│           │       │   │   └── translation.mdx
│           │       │   ├── testing.mdx
│           │       │   ├── tokenizer_summary.mdx
│           │       │   ├── torchscript.mdx
│           │       │   ├── training.mdx
│           │       │   └── troubleshooting.mdx
│           │       ├── es/
│           │       │   ├── _config.py
│           │       │   ├── _toctree.yml
│           │       │   ├── accelerate.mdx
│           │       │   ├── autoclass_tutorial.mdx
│           │       │   ├── bertology.mdx
│           │       │   ├── converting_tensorflow_models.mdx
│           │       │   ├── create_a_model.mdx
│           │       │   ├── custom_models.mdx
│           │       │   ├── fast_tokenizers.mdx
│           │       │   ├── index.mdx
│           │       │   ├── installation.mdx
│           │       │   ├── model_sharing.mdx
│           │       │   ├── multilingual.mdx
│           │       │   ├── philosophy.mdx
│           │       │   ├── pipeline_tutorial.mdx
│           │       │   ├── preprocessing.mdx
│           │       │   ├── quicktour.mdx
│           │       │   ├── run_scripts.mdx
│           │       │   ├── sagemaker.mdx
│           │       │   ├── tasks/
│           │       │   │   ├── image_classification.mdx
│           │       │   │   ├── language_modeling.mdx
│           │       │   │   ├── multiple_choice.mdx
│           │       │   │   ├── question_answering.mdx
│           │       │   │   └── summarization.mdx
│           │       │   └── training.mdx
│           │       ├── it/
│           │       │   ├── _config.py
│           │       │   ├── _toctree.yml
│           │       │   ├── accelerate.mdx
│           │       │   ├── add_new_model.mdx
│           │       │   ├── add_new_pipeline.mdx
│           │       │   ├── autoclass_tutorial.mdx
│           │       │   ├── converting_tensorflow_models.mdx
│           │       │   ├── create_a_model.mdx
│           │       │   ├── custom_models.mdx
│           │       │   ├── debugging.mdx
│           │       │   ├── index.mdx
│           │       │   ├── installation.mdx
│           │       │   ├── model_sharing.mdx
│           │       │   ├── multilingual.mdx
│           │       │   ├── perf_hardware.mdx
│           │       │   ├── pipeline_tutorial.mdx
│           │       │   ├── preprocessing.mdx
│           │       │   ├── quicktour.mdx
│           │       │   ├── run_scripts.mdx
│           │       │   ├── serialization.mdx
│           │       │   └── training.mdx
│           │       └── pt/
│           │           ├── _config.py
│           │           ├── _toctree.yml
│           │           ├── accelerate.mdx
│           │           ├── converting_tensorflow_models.mdx
│           │           ├── create_a_model.mdx
│           │           ├── custom_models.mdx
│           │           ├── fast_tokenizers.mdx
│           │           ├── index.mdx
│           │           ├── installation.mdx
│           │           ├── multilingual.mdx
│           │           ├── pipeline_tutorial.mdx
│           │           ├── quicktour.mdx
│           │           ├── run_scripts.mdx
│           │           ├── serialization.mdx
│           │           ├── tasks/
│           │           │   ├── sequence_classification.mdx
│           │           │   └── token_classification.mdx
│           │           └── training.mdx
│           ├── examples/
│           │   ├── README.md
│           │   ├── flax/
│           │   │   ├── README.md
│           │   │   ├── _tests_requirements.txt
│           │   │   ├── conftest.py
│           │   │   ├── image-captioning/
│           │   │   │   ├── README.md
│           │   │   │   ├── create_model_from_encoder_decoder_models.py
│           │   │   │   └── run_image_captioning_flax.py
│           │   │   ├── language-modeling/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_bart_dlm_flax.py
│           │   │   │   ├── run_clm_flax.py
│           │   │   │   ├── run_mlm_flax.py
│           │   │   │   ├── run_t5_mlm_flax.py
│           │   │   │   └── t5_tokenizer_model.py
│           │   │   ├── question-answering/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_qa.py
│           │   │   │   └── utils_qa.py
│           │   │   ├── summarization/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_summarization_flax.py
│           │   │   ├── test_flax_examples.py
│           │   │   ├── text-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_flax_glue.py
│           │   │   ├── token-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_flax_ner.py
│           │   │   └── vision/
│           │   │       ├── README.md
│           │   │       ├── requirements.txt
│           │   │       └── run_image_classification.py
│           │   ├── legacy/
│           │   │   ├── README.md
│           │   │   ├── multiple_choice/
│           │   │   │   ├── run_multiple_choice.py
│           │   │   │   └── utils_multiple_choice.py
│           │   │   ├── pytorch-lightning/
│           │   │   │   ├── lightning_base.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_glue.py
│           │   │   │   ├── run_glue.sh
│           │   │   │   ├── run_ner.py
│           │   │   │   ├── run_ner.sh
│           │   │   │   └── run_pos.sh
│           │   │   ├── question-answering/
│           │   │   │   ├── README.md
│           │   │   │   ├── run_squad.py
│           │   │   │   └── run_squad_trainer.py
│           │   │   ├── run_camembert.py
│           │   │   ├── run_chinese_ref.py
│           │   │   ├── run_language_modeling.py
│           │   │   ├── run_openai_gpt.py
│           │   │   ├── run_swag.py
│           │   │   ├── run_transfo_xl.py
│           │   │   ├── seq2seq/
│           │   │   │   ├── README.md
│           │   │   │   ├── __init__.py
│           │   │   │   ├── convert_model_to_fp16.py
│           │   │   │   ├── download_wmt.py
│           │   │   │   ├── finetune.sh
│           │   │   │   ├── finetune_tpu.sh
│           │   │   │   ├── finetune_trainer.py
│           │   │   │   ├── minify_dataset.py
│           │   │   │   ├── old_test_calculate_rouge.py
│           │   │   │   ├── old_test_datasets.py
│           │   │   │   ├── old_test_fsmt_bleu_score.py
│           │   │   │   ├── old_test_seq2seq_examples.py
│           │   │   │   ├── old_test_seq2seq_examples_multi_gpu.py
│           │   │   │   ├── old_test_tatoeba_conversion.py
│           │   │   │   ├── pack_dataset.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── romanian_postprocessing.md
│           │   │   │   ├── rouge_cli.py
│           │   │   │   ├── run_distributed_eval.py
│           │   │   │   ├── run_eval.py
│           │   │   │   ├── run_eval_search.py
│           │   │   │   ├── save_len_file.py
│           │   │   │   ├── save_randomly_initialized_model.py
│           │   │   │   ├── sentence_splitter.py
│           │   │   │   ├── seq2seq_trainer.py
│           │   │   │   ├── seq2seq_training_args.py
│           │   │   │   ├── test_data/
│           │   │   │   │   ├── fsmt/
│           │   │   │   │   │   ├── build-eval-data.py
│           │   │   │   │   │   └── fsmt_val_data.json
│           │   │   │   │   └── wmt_en_ro/
│           │   │   │   │       ├── test.source
│           │   │   │   │       ├── test.target
│           │   │   │   │       ├── train.len
│           │   │   │   │       ├── train.source
│           │   │   │   │       ├── train.target
│           │   │   │   │       ├── val.len
│           │   │   │   │       ├── val.source
│           │   │   │   │       └── val.target
│           │   │   │   ├── train_distil_marian_enro.sh
│           │   │   │   ├── train_distil_marian_enro_tpu.sh
│           │   │   │   ├── train_distilbart_cnn.sh
│           │   │   │   ├── train_mbart_cc25_enro.sh
│           │   │   │   ├── utils.py
│           │   │   │   └── xla_spawn.py
│           │   │   ├── text-classification/
│           │   │   │   └── run_tf_text_classification.py
│           │   │   └── token-classification/
│           │   │       ├── README.md
│           │   │       ├── run.sh
│           │   │       ├── run_chunk.sh
│           │   │       ├── run_ner.py
│           │   │       ├── run_pos.sh
│           │   │       ├── run_tf_ner.py
│           │   │       ├── scripts/
│           │   │       │   └── preprocess.py
│           │   │       ├── tasks.py
│           │   │       └── utils_ner.py
│           │   ├── pytorch/
│           │   │   ├── README.md
│           │   │   ├── _tests_requirements.txt
│           │   │   ├── audio-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_audio_classification.py
│           │   │   ├── benchmarking/
│           │   │   │   ├── README.md
│           │   │   │   ├── plot_csv_file.py
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_benchmark.py
│           │   │   ├── conftest.py
│           │   │   ├── contrastive-image-text/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_clip.py
│           │   │   ├── image-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_image_classification.py
│           │   │   │   └── run_image_classification_no_trainer.py
│           │   │   ├── image-pretraining/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_mae.py
│           │   │   │   └── run_mim.py
│           │   │   ├── language-modeling/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_clm.py
│           │   │   │   ├── run_clm_no_trainer.py
│           │   │   │   ├── run_mlm.py
│           │   │   │   ├── run_mlm_no_trainer.py
│           │   │   │   └── run_plm.py
│           │   │   ├── multiple-choice/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_no_trainer.sh
│           │   │   │   ├── run_swag.py
│           │   │   │   └── run_swag_no_trainer.py
│           │   │   ├── question-answering/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_qa.py
│           │   │   │   ├── run_qa_beam_search.py
│           │   │   │   ├── run_qa_beam_search_no_trainer.py
│           │   │   │   ├── run_qa_no_trainer.py
│           │   │   │   ├── run_seq2seq_qa.py
│           │   │   │   ├── trainer_qa.py
│           │   │   │   ├── trainer_seq2seq_qa.py
│           │   │   │   └── utils_qa.py
│           │   │   ├── semantic-segmentation/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_semantic_segmentation.py
│           │   │   │   └── run_semantic_segmentation_no_trainer.py
│           │   │   ├── speech-pretraining/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_wav2vec2_pretraining_no_trainer.py
│           │   │   ├── speech-recognition/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_speech_recognition_ctc.py
│           │   │   │   └── run_speech_recognition_seq2seq.py
│           │   │   ├── summarization/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_summarization.py
│           │   │   │   └── run_summarization_no_trainer.py
│           │   │   ├── test_accelerate_examples.py
│           │   │   ├── test_pytorch_examples.py
│           │   │   ├── test_xla_examples.py
│           │   │   ├── text-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_glue.py
│           │   │   │   ├── run_glue_no_trainer.py
│           │   │   │   └── run_xnli.py
│           │   │   ├── text-generation/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_generation.py
│           │   │   │   └── run_generation_contrastive_search.py
│           │   │   ├── token-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run.sh
│           │   │   │   ├── run_ner.py
│           │   │   │   ├── run_ner_no_trainer.py
│           │   │   │   └── run_no_trainer.sh
│           │   │   ├── translation/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_translation.py
│           │   │   │   └── run_translation_no_trainer.py
│           │   │   └── xla_spawn.py
│           │   ├── research_projects/
│           │   │   ├── README.md
│           │   │   ├── adversarial/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_hans.py
│           │   │   │   └── utils_hans.py
│           │   │   ├── bert-loses-patience/
│           │   │   │   ├── README.md
│           │   │   │   ├── pabee/
│           │   │   │   │   ├── __init__.py
│           │   │   │   │   ├── modeling_pabee_albert.py
│           │   │   │   │   └── modeling_pabee_bert.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_glue_with_pabee.py
│           │   │   │   └── test_run_glue_with_pabee.py
│           │   │   ├── bertabs/
│           │   │   │   ├── README.md
│           │   │   │   ├── __init__.py
│           │   │   │   ├── configuration_bertabs.py
│           │   │   │   ├── convert_bertabs_original_pytorch_checkpoint.py
│           │   │   │   ├── modeling_bertabs.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_summarization.py
│           │   │   │   ├── test_utils_summarization.py
│           │   │   │   └── utils_summarization.py
│           │   │   ├── bertology/
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_bertology.py
│           │   │   │   └── run_prune_gpt.py
│           │   │   ├── codeparrot/
│           │   │   │   ├── README.md
│           │   │   │   ├── examples/
│           │   │   │   │   ├── README.md
│           │   │   │   │   ├── requirements.txt
│           │   │   │   │   └── train_complexity_predictor.py
│           │   │   │   ├── requirements.txt
│           │   │   │   └── scripts/
│           │   │   │       ├── arguments.py
│           │   │   │       ├── bpe_training.py
│           │   │   │       ├── codeparrot_training.py
│           │   │   │       ├── human_eval.py
│           │   │   │       ├── initialize_model.py
│           │   │   │       ├── minhash_deduplication.py
│           │   │   │       ├── preprocessing.py
│           │   │   │       ├── pretokenizing.py
│           │   │   │       ├── tests/
│           │   │   │       │   ├── __init__.py
│           │   │   │       │   └── test_deduplicate.py
│           │   │   │       └── validation_loss.py
│           │   │   ├── decision_transformer/
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_decision_transformer.py
│           │   │   ├── deebert/
│           │   │   │   ├── README.md
│           │   │   │   ├── entropy_eval.sh
│           │   │   │   ├── eval_deebert.sh
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_glue_deebert.py
│           │   │   │   ├── src/
│           │   │   │   │   ├── __init__.py
│           │   │   │   │   ├── modeling_highway_bert.py
│           │   │   │   │   └── modeling_highway_roberta.py
│           │   │   │   ├── test_glue_deebert.py
│           │   │   │   └── train_deebert.sh
│           │   │   ├── distillation/
│           │   │   │   ├── README.md
│           │   │   │   ├── distiller.py
│           │   │   │   ├── grouped_batch_sampler.py
│           │   │   │   ├── lm_seqs_dataset.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_squad_w_distillation.py
│           │   │   │   ├── scripts/
│           │   │   │   │   ├── binarized_data.py
│           │   │   │   │   ├── extract.py
│           │   │   │   │   ├── extract_distilbert.py
│           │   │   │   │   └── token_counts.py
│           │   │   │   ├── train.py
│           │   │   │   ├── training_configs/
│           │   │   │   │   ├── distilbert-base-cased.json
│           │   │   │   │   ├── distilbert-base-multilingual-cased.json
│           │   │   │   │   ├── distilbert-base-uncased.json
│           │   │   │   │   ├── distilgpt2.json
│           │   │   │   │   └── distilroberta-base.json
│           │   │   │   └── utils.py
│           │   │   ├── fsner/
│           │   │   │   ├── README.md
│           │   │   │   ├── pyproject.toml
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── setup.py
│           │   │   │   └── src/
│           │   │   │       └── fsner/
│           │   │   │           ├── __init__.py
│           │   │   │           ├── model.py
│           │   │   │           └── tokenizer_utils.py
│           │   │   ├── information-gain-filtration/
│           │   │   │   ├── README.md
│           │   │   │   ├── igf/
│           │   │   │   │   ├── __init__.py
│           │   │   │   │   └── igf.py
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_clm_igf.py
│           │   │   ├── jax-projects/
│           │   │   │   ├── HOW_TO_PROPOSE_PROJECT.md
│           │   │   │   ├── README.md
│           │   │   │   ├── big_bird/
│           │   │   │   │   ├── README.md
│           │   │   │   │   ├── bigbird_flax.py
│           │   │   │   │   ├── evaluate.py
│           │   │   │   │   ├── prepare_natural_questions.py
│           │   │   │   │   ├── requirements.txt
│           │   │   │   │   ├── sweep_flax.yaml
│           │   │   │   │   └── train.py
│           │   │   │   ├── dataset-streaming/
│           │   │   │   │   ├── README.md
│           │   │   │   │   └── run_mlm_flax_stream.py
│           │   │   │   ├── hybrid_clip/
│           │   │   │   │   ├── README.md
│           │   │   │   │   ├── configuration_hybrid_clip.py
│           │   │   │   │   ├── modeling_hybrid_clip.py
│           │   │   │   │   ├── requirements.txt
│           │   │   │   │   └── run_hybrid_clip.py
│           │   │   │   ├── model_parallel/
│           │   │   │   │   ├── README.md
│           │   │   │   │   ├── partitions.py
│           │   │   │   │   └── run_clm_mp.py
│           │   │   │   └── wav2vec2/
│           │   │   │       ├── README.md
│           │   │   │       └── run_wav2vec2_pretrain_flax.py
│           │   │   ├── layoutlmv3/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_funsd_cord.py
│           │   │   ├── longform-qa/
│           │   │   │   ├── README.md
│           │   │   │   ├── eli5_app.py
│           │   │   │   ├── eli5_utils.py
│           │   │   │   └── requirements.txt
│           │   │   ├── luke/
│           │   │   │   ├── README.md
│           │   │   │   ├── luke_utils.py
│           │   │   │   └── run_luke_ner_no_trainer.py
│           │   │   ├── lxmert/
│           │   │   │   ├── README.md
│           │   │   │   ├── demo.ipynb
│           │   │   │   ├── extracting_data.py
│           │   │   │   ├── modeling_frcnn.py
│           │   │   │   ├── processing_image.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── utils.py
│           │   │   │   └── visualizing_image.py
│           │   │   ├── mlm_wwm/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_chinese_ref.py
│           │   │   │   └── run_mlm_wwm.py
│           │   │   ├── mm-imdb/
│           │   │   │   ├── README.md
│           │   │   │   ├── run_mmimdb.py
│           │   │   │   └── utils_mmimdb.py
│           │   │   ├── movement-pruning/
│           │   │   │   ├── README.md
│           │   │   │   ├── Saving_PruneBERT.ipynb
│           │   │   │   ├── bertarize.py
│           │   │   │   ├── counts_parameters.py
│           │   │   │   ├── emmental/
│           │   │   │   │   ├── __init__.py
│           │   │   │   │   ├── configuration_bert_masked.py
│           │   │   │   │   ├── modeling_bert_masked.py
│           │   │   │   │   └── modules/
│           │   │   │   │       ├── __init__.py
│           │   │   │   │       ├── binarizer.py
│           │   │   │   │       └── masked_nn.py
│           │   │   │   ├── masked_run_glue.py
│           │   │   │   ├── masked_run_squad.py
│           │   │   │   └── requirements.txt
│           │   │   ├── onnx/
│           │   │   │   └── summarization/
│           │   │   │       ├── README.md
│           │   │   │       ├── bart_onnx/
│           │   │   │       │   ├── generation_onnx.py
│           │   │   │       │   └── reduce_onnx_size.py
│           │   │   │       ├── requirements.txt
│           │   │   │       └── run_onnx_exporter.py
│           │   │   ├── performer/
│           │   │   │   ├── README.md
│           │   │   │   ├── full_script.sh
│           │   │   │   ├── modeling_flax_performer.py
│           │   │   │   ├── modeling_flax_performer_utils.py
│           │   │   │   ├── run_mlm_performer.py
│           │   │   │   └── sanity_script.sh
│           │   │   ├── pplm/
│           │   │   │   ├── README.md
│           │   │   │   ├── pplm_classification_head.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_pplm.py
│           │   │   │   └── run_pplm_discrim_train.py
│           │   │   ├── quantization-qdqbert/
│           │   │   │   ├── Dockerfile
│           │   │   │   ├── README.md
│           │   │   │   ├── evaluate-hf-trt-qa.py
│           │   │   │   ├── ort-infer-benchmark.py
│           │   │   │   ├── quant_trainer.py
│           │   │   │   ├── run_quant_qa.py
│           │   │   │   ├── trainer_quant_qa.py
│           │   │   │   └── utils_qa.py
│           │   │   ├── rag/
│           │   │   │   ├── README.md
│           │   │   │   ├── __init__.py
│           │   │   │   ├── _test_finetune_rag.py
│           │   │   │   ├── callbacks_rag.py
│           │   │   │   ├── consolidate_rag_checkpoint.py
│           │   │   │   ├── distributed_pytorch_retriever.py
│           │   │   │   ├── distributed_ray_retriever.py
│           │   │   │   ├── eval_rag.py
│           │   │   │   ├── finetune_rag.py
│           │   │   │   ├── finetune_rag.sh
│           │   │   │   ├── finetune_rag_ray.sh
│           │   │   │   ├── lightning_base.py
│           │   │   │   ├── parse_dpr_relevance_data.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── test_data/
│           │   │   │   │   └── my_knowledge_dataset.csv
│           │   │   │   ├── test_distributed_retriever.py
│           │   │   │   ├── use_own_knowledge_dataset.py
│           │   │   │   └── utils_rag.py
│           │   │   ├── rag-end2end-retriever/
│           │   │   │   ├── README.md
│           │   │   │   ├── callbacks_rag.py
│           │   │   │   ├── distributed_ray_retriever.py
│           │   │   │   ├── eval_rag.py
│           │   │   │   ├── finetune_rag.py
│           │   │   │   ├── finetune_rag_ray_end2end.sh
│           │   │   │   ├── kb_encode_utils.py
│           │   │   │   ├── lightning_base.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── test_run/
│           │   │   │   │   ├── dummy-kb/
│           │   │   │   │   │   └── my_knowledge_dataset.csv
│           │   │   │   │   ├── dummy-train-data/
│           │   │   │   │   │   ├── train.source
│           │   │   │   │   │   ├── train.target
│           │   │   │   │   │   ├── val.source
│           │   │   │   │   │   └── val.target
│           │   │   │   │   ├── test_finetune.sh
│           │   │   │   │   └── test_rag_new_features.sh
│           │   │   │   ├── use_own_knowledge_dataset.py
│           │   │   │   └── utils_rag.py
│           │   │   ├── robust-speech-event/
│           │   │   │   ├── README.md
│           │   │   │   ├── eval.py
│           │   │   │   ├── run_speech_recognition_ctc_bnb.py
│           │   │   │   └── run_speech_recognition_ctc_streaming.py
│           │   │   ├── self-training-text-classification/
│           │   │   │   ├── README.md
│           │   │   │   ├── finetuning.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run.sh
│           │   │   │   └── selftraining.py
│           │   │   ├── seq2seq-distillation/
│           │   │   │   ├── README.md
│           │   │   │   ├── _test_bash_script.py
│           │   │   │   ├── _test_make_student.py
│           │   │   │   ├── _test_seq2seq_examples.py
│           │   │   │   ├── _test_seq2seq_examples_multi_gpu.py
│           │   │   │   ├── callbacks.py
│           │   │   │   ├── convert_pl_checkpoint_to_hf.py
│           │   │   │   ├── distil_marian_enro_teacher.sh
│           │   │   │   ├── distil_marian_no_teacher.sh
│           │   │   │   ├── distillation.py
│           │   │   │   ├── dynamic_bs_example.sh
│           │   │   │   ├── finetune.py
│           │   │   │   ├── finetune.sh
│           │   │   │   ├── finetune_bart_tiny.sh
│           │   │   │   ├── finetune_pegasus_xsum.sh
│           │   │   │   ├── finetune_t5.sh
│           │   │   │   ├── lightning_base.py
│           │   │   │   ├── make_student.py
│           │   │   │   ├── precomputed_pseudo_labels.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_eval.py
│           │   │   │   ├── sentence_splitter.py
│           │   │   │   ├── train_distilbart_cnn.sh
│           │   │   │   ├── train_distilbart_xsum.sh
│           │   │   │   ├── train_mbart_cc25_enro.sh
│           │   │   │   └── utils.py
│           │   │   ├── tapex/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_tabfact_with_tapex.py
│           │   │   │   ├── run_wikisql_with_tapex.py
│           │   │   │   ├── run_wikitablequestions_with_tapex.py
│           │   │   │   └── wikisql_utils.py
│           │   │   ├── visual_bert/
│           │   │   │   ├── README.md
│           │   │   │   ├── demo.ipynb
│           │   │   │   ├── extracting_data.py
│           │   │   │   ├── modeling_frcnn.py
│           │   │   │   ├── processing_image.py
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── utils.py
│           │   │   │   └── visualizing_image.py
│           │   │   ├── wav2vec2/
│           │   │   │   ├── FINE_TUNE_XLSR_WAV2VEC2.md
│           │   │   │   ├── README.md
│           │   │   │   ├── alignment.py
│           │   │   │   ├── ds_config_wav2vec2_zero2.json
│           │   │   │   ├── ds_config_wav2vec2_zero3.json
│           │   │   │   ├── finetune_base_100.sh
│           │   │   │   ├── finetune_base_timit_asr.sh
│           │   │   │   ├── finetune_large_lv60_100.sh
│           │   │   │   ├── finetune_large_lv60_timit_asr.sh
│           │   │   │   ├── finetune_large_xlsr_53_arabic_speech_corpus.sh
│           │   │   │   ├── finetune_wav2vec2_xlsr_turkish.sh
│           │   │   │   ├── requirements.txt
│           │   │   │   ├── run_alignment.sh
│           │   │   │   ├── run_asr.py
│           │   │   │   ├── run_common_voice.py
│           │   │   │   ├── run_pretrain.py
│           │   │   │   ├── test_wav2vec2_deepspeed.py
│           │   │   │   └── vocab/
│           │   │   │       └── buckwalter.json
│           │   │   ├── xtreme-s/
│           │   │   │   ├── README.md
│           │   │   │   ├── requirements.txt
│           │   │   │   └── run_xtreme_s.py
│           │   │   └── zero-shot-distillation/
│           │   │       ├── README.md
│           │   │       └── distill_classifier.py
│           │   └── tensorflow/
│           │       ├── README.md
│           │       ├── _tests_requirements.txt
│           │       ├── benchmarking/
│           │       │   ├── README.md
│           │       │   ├── plot_csv_file.py
│           │       │   ├── requirements.txt
│           │       │   └── run_benchmark_tf.py
│           │       ├── language-modeling/
│           │       │   ├── README.md
│           │       │   ├── requirements.txt
│           │       │   ├── run_clm.py
│           │       │   └── run_mlm.py
│           │       ├── multiple-choice/
│           │       │   ├── README.md
│           │       │   ├── requirements.txt
│           │       │   └── run_swag.py
│           │       ├── question-answering/
│           │       │   ├── README.md
│           │       │   ├── requirements.txt
│           │       │   ├── run_qa.py
│           │       │   └── utils_qa.py
│           │       ├── summarization/
│           │       │   ├── README.md
│           │       │   ├── requirements.txt
│           │       │   └── run_summarization.py
│           │       ├── test_tensorflow_examples.py
│           │       ├── text-classification/
│           │       │   ├── README.md
│           │       │   ├── requirements.txt
│           │       │   ├── run_glue.py
│           │       │   └── run_text_classification.py
│           │       ├── token-classification/
│           │       │   ├── README.md
│           │       │   ├── requirements.txt
│           │       │   └── run_ner.py
│           │       └── translation/
│           │           ├── README.md
│           │           ├── requirements.txt
│           │           └── run_translation.py
│           ├── hubconf.py
│           ├── model_cards/
│           │   └── README.md
│           ├── notebooks/
│           │   └── README.md
│           ├── pyproject.toml
│           ├── scripts/
│           │   ├── benchmark/
│           │   │   └── trainer-benchmark.py
│           │   ├── check_tokenizers.py
│           │   ├── distributed/
│           │   │   └── torch-distributed-gpu-test.py
│           │   ├── fsmt/
│           │   │   ├── convert-allenai-wmt16.sh
│           │   │   ├── convert-allenai-wmt19.sh
│           │   │   ├── convert-facebook-wmt19.sh
│           │   │   ├── eval-allenai-wmt16.sh
│           │   │   ├── eval-allenai-wmt19.sh
│           │   │   ├── eval-facebook-wmt19.sh
│           │   │   ├── fsmt-make-super-tiny-model.py
│           │   │   ├── fsmt-make-tiny-model.py
│           │   │   ├── gen-card-allenai-wmt16.py
│           │   │   ├── gen-card-allenai-wmt19.py
│           │   │   ├── gen-card-facebook-wmt19.py
│           │   │   ├── s3-move.sh
│           │   │   └── tests-to-run.sh
│           │   ├── pegasus/
│           │   │   └── build_test_sample_spm_no_bos.py
│           │   ├── stale.py
│           │   └── tatoeba/
│           │       ├── README.md
│           │       └── upload_models.sh
│           ├── setup.cfg
│           ├── setup.py
│           ├── src/
│           │   └── transformers/
│           │       ├── __init__.py
│           │       ├── activations.py
│           │       ├── activations_tf.py
│           │       ├── benchmark/
│           │       │   ├── __init__.py
│           │       │   ├── benchmark.py
│           │       │   ├── benchmark_args.py
│           │       │   ├── benchmark_args_tf.py
│           │       │   ├── benchmark_args_utils.py
│           │       │   ├── benchmark_tf.py
│           │       │   └── benchmark_utils.py
│           │       ├── commands/
│           │       │   ├── __init__.py
│           │       │   ├── add_new_model.py
│           │       │   ├── add_new_model_like.py
│           │       │   ├── convert.py
│           │       │   ├── download.py
│           │       │   ├── env.py
│           │       │   ├── lfs.py
│           │       │   ├── pt_to_tf.py
│           │       │   ├── run.py
│           │       │   ├── serving.py
│           │       │   ├── train.py
│           │       │   ├── transformers_cli.py
│           │       │   └── user.py
│           │       ├── configuration_utils.py
│           │       ├── convert_graph_to_onnx.py
│           │       ├── convert_pytorch_checkpoint_to_tf2.py
│           │       ├── convert_slow_tokenizer.py
│           │       ├── convert_slow_tokenizers_checkpoints_to_fast.py
│           │       ├── convert_tf_hub_seq_to_seq_bert_to_pytorch.py
│           │       ├── data/
│           │       │   ├── __init__.py
│           │       │   ├── data_collator.py
│           │       │   ├── datasets/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── glue.py
│           │       │   │   ├── language_modeling.py
│           │       │   │   └── squad.py
│           │       │   ├── metrics/
│           │       │   │   ├── __init__.py
│           │       │   │   └── squad_metrics.py
│           │       │   ├── processors/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── glue.py
│           │       │   │   ├── squad.py
│           │       │   │   ├── utils.py
│           │       │   │   └── xnli.py
│           │       │   └── test_generation_utils.py
│           │       ├── debug_utils.py
│           │       ├── deepspeed.py
│           │       ├── dependency_versions_check.py
│           │       ├── dependency_versions_table.py
│           │       ├── dynamic_module_utils.py
│           │       ├── feature_extraction_sequence_utils.py
│           │       ├── feature_extraction_utils.py
│           │       ├── file_utils.py
│           │       ├── generation_beam_constraints.py
│           │       ├── generation_beam_search.py
│           │       ├── generation_flax_logits_process.py
│           │       ├── generation_flax_utils.py
│           │       ├── generation_logits_process.py
│           │       ├── generation_stopping_criteria.py
│           │       ├── generation_tf_logits_process.py
│           │       ├── generation_tf_utils.py
│           │       ├── generation_utils.py
│           │       ├── hf_argparser.py
│           │       ├── image_processing_utils.py
│           │       ├── image_transforms.py
│           │       ├── image_utils.py
│           │       ├── integrations.py
│           │       ├── keras_callbacks.py
│           │       ├── modelcard.py
│           │       ├── modeling_flax_outputs.py
│           │       ├── modeling_flax_pytorch_utils.py
│           │       ├── modeling_flax_utils.py
│           │       ├── modeling_outputs.py
│           │       ├── modeling_tf_outputs.py
│           │       ├── modeling_tf_pytorch_utils.py
│           │       ├── modeling_tf_utils.py
│           │       ├── modeling_utils.py
│           │       ├── models/
│           │       │   ├── __init__.py
│           │       │   ├── albert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_albert.py
│           │       │   │   ├── convert_albert_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_albert.py
│           │       │   │   ├── modeling_flax_albert.py
│           │       │   │   ├── modeling_tf_albert.py
│           │       │   │   ├── tokenization_albert.py
│           │       │   │   └── tokenization_albert_fast.py
│           │       │   ├── auto/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── auto_factory.py
│           │       │   │   ├── configuration_auto.py
│           │       │   │   ├── feature_extraction_auto.py
│           │       │   │   ├── modeling_auto.py
│           │       │   │   ├── modeling_flax_auto.py
│           │       │   │   ├── modeling_tf_auto.py
│           │       │   │   ├── processing_auto.py
│           │       │   │   └── tokenization_auto.py
│           │       │   ├── bart/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_bart.py
│           │       │   │   ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_bart.py
│           │       │   │   ├── modeling_flax_bart.py
│           │       │   │   ├── modeling_tf_bart.py
│           │       │   │   ├── tokenization_bart.py
│           │       │   │   └── tokenization_bart_fast.py
│           │       │   ├── barthez/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── tokenization_barthez.py
│           │       │   │   └── tokenization_barthez_fast.py
│           │       │   ├── bartpho/
│           │       │   │   ├── __init__.py
│           │       │   │   └── tokenization_bartpho.py
│           │       │   ├── beit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_beit.py
│           │       │   │   ├── convert_beit_unilm_to_pytorch.py
│           │       │   │   ├── feature_extraction_beit.py
│           │       │   │   ├── modeling_beit.py
│           │       │   │   └── modeling_flax_beit.py
│           │       │   ├── bert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_bert.py
│           │       │   │   ├── convert_bert_original_tf2_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_bert_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_bert_pytorch_checkpoint_to_original_tf.py
│           │       │   │   ├── convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_bert.py
│           │       │   │   ├── modeling_flax_bert.py
│           │       │   │   ├── modeling_tf_bert.py
│           │       │   │   ├── tokenization_bert.py
│           │       │   │   ├── tokenization_bert_fast.py
│           │       │   │   └── tokenization_bert_tf.py
│           │       │   ├── bert_generation/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_bert_generation.py
│           │       │   │   ├── modeling_bert_generation.py
│           │       │   │   └── tokenization_bert_generation.py
│           │       │   ├── bert_japanese/
│           │       │   │   ├── __init__.py
│           │       │   │   └── tokenization_bert_japanese.py
│           │       │   ├── bertweet/
│           │       │   │   ├── __init__.py
│           │       │   │   └── tokenization_bertweet.py
│           │       │   ├── big_bird/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_big_bird.py
│           │       │   │   ├── convert_bigbird_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_big_bird.py
│           │       │   │   ├── modeling_flax_big_bird.py
│           │       │   │   ├── tokenization_big_bird.py
│           │       │   │   └── tokenization_big_bird_fast.py
│           │       │   ├── bigbird_pegasus/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_bigbird_pegasus.py
│           │       │   │   ├── convert_bigbird_pegasus_tf_to_pytorch.py
│           │       │   │   └── modeling_bigbird_pegasus.py
│           │       │   ├── blenderbot/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_blenderbot.py
│           │       │   │   ├── convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_blenderbot.py
│           │       │   │   ├── modeling_flax_blenderbot.py
│           │       │   │   ├── modeling_tf_blenderbot.py
│           │       │   │   ├── tokenization_blenderbot.py
│           │       │   │   └── tokenization_blenderbot_fast.py
│           │       │   ├── blenderbot_small/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_blenderbot_small.py
│           │       │   │   ├── modeling_blenderbot_small.py
│           │       │   │   ├── modeling_flax_blenderbot_small.py
│           │       │   │   ├── modeling_tf_blenderbot_small.py
│           │       │   │   ├── tokenization_blenderbot_small.py
│           │       │   │   └── tokenization_blenderbot_small_fast.py
│           │       │   ├── bloom/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_bloom.py
│           │       │   │   ├── convert_bloom_original_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_bloom.py
│           │       │   │   └── tokenization_bloom_fast.py
│           │       │   ├── bort/
│           │       │   │   ├── __init__.py
│           │       │   │   └── convert_bort_original_gluonnlp_checkpoint_to_pytorch.py
│           │       │   ├── byt5/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── convert_byt5_original_tf_checkpoint_to_pytorch.py
│           │       │   │   └── tokenization_byt5.py
│           │       │   ├── camembert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_camembert.py
│           │       │   │   ├── modeling_camembert.py
│           │       │   │   ├── modeling_tf_camembert.py
│           │       │   │   ├── tokenization_camembert.py
│           │       │   │   └── tokenization_camembert_fast.py
│           │       │   ├── canine/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_canine.py
│           │       │   │   ├── convert_canine_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_canine.py
│           │       │   │   └── tokenization_canine.py
│           │       │   ├── clip/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_clip.py
│           │       │   │   ├── convert_clip_original_pytorch_to_hf.py
│           │       │   │   ├── feature_extraction_clip.py
│           │       │   │   ├── modeling_clip.py
│           │       │   │   ├── modeling_flax_clip.py
│           │       │   │   ├── modeling_tf_clip.py
│           │       │   │   ├── processing_clip.py
│           │       │   │   ├── tokenization_clip.py
│           │       │   │   └── tokenization_clip_fast.py
│           │       │   ├── codegen/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_codegen.py
│           │       │   │   ├── modeling_codegen.py
│           │       │   │   ├── tokenization_codegen.py
│           │       │   │   └── tokenization_codegen_fast.py
│           │       │   ├── conditional_detr/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_conditional_detr.py
│           │       │   │   ├── convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── feature_extraction_conditional_detr.py
│           │       │   │   └── modeling_conditional_detr.py
│           │       │   ├── convbert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_convbert.py
│           │       │   │   ├── convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py
│           │       │   │   ├── modeling_convbert.py
│           │       │   │   ├── modeling_tf_convbert.py
│           │       │   │   ├── tokenization_convbert.py
│           │       │   │   └── tokenization_convbert_fast.py
│           │       │   ├── convnext/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_convnext.py
│           │       │   │   ├── convert_convnext_to_pytorch.py
│           │       │   │   ├── feature_extraction_convnext.py
│           │       │   │   ├── modeling_convnext.py
│           │       │   │   └── modeling_tf_convnext.py
│           │       │   ├── cpm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── tokenization_cpm.py
│           │       │   │   └── tokenization_cpm_fast.py
│           │       │   ├── ctrl/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_ctrl.py
│           │       │   │   ├── modeling_ctrl.py
│           │       │   │   ├── modeling_tf_ctrl.py
│           │       │   │   └── tokenization_ctrl.py
│           │       │   ├── cvt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_cvt.py
│           │       │   │   ├── convert_cvt_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_cvt.py
│           │       │   │   └── modeling_tf_cvt.py
│           │       │   ├── data2vec/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_data2vec_audio.py
│           │       │   │   ├── configuration_data2vec_text.py
│           │       │   │   ├── configuration_data2vec_vision.py
│           │       │   │   ├── convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_data2vec_audio.py
│           │       │   │   ├── modeling_data2vec_text.py
│           │       │   │   ├── modeling_data2vec_vision.py
│           │       │   │   └── modeling_tf_data2vec_vision.py
│           │       │   ├── deberta/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_deberta.py
│           │       │   │   ├── modeling_deberta.py
│           │       │   │   ├── modeling_tf_deberta.py
│           │       │   │   ├── tokenization_deberta.py
│           │       │   │   └── tokenization_deberta_fast.py
│           │       │   ├── deberta_v2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_deberta_v2.py
│           │       │   │   ├── modeling_deberta_v2.py
│           │       │   │   ├── modeling_tf_deberta_v2.py
│           │       │   │   ├── tokenization_deberta_v2.py
│           │       │   │   └── tokenization_deberta_v2_fast.py
│           │       │   ├── decision_transformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_decision_transformer.py
│           │       │   │   └── modeling_decision_transformer.py
│           │       │   ├── deformable_detr/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_deformable_detr.py
│           │       │   │   ├── convert_deformable_detr_to_pytorch.py
│           │       │   │   ├── custom_kernel/
│           │       │   │   │   ├── cpu/
│           │       │   │   │   │   ├── ms_deform_attn_cpu.cpp
│           │       │   │   │   │   └── ms_deform_attn_cpu.h
│           │       │   │   │   ├── cuda/
│           │       │   │   │   │   ├── ms_deform_attn_cuda.cu
│           │       │   │   │   │   ├── ms_deform_attn_cuda.cuh
│           │       │   │   │   │   ├── ms_deform_attn_cuda.h
│           │       │   │   │   │   └── ms_deform_im2col_cuda.cuh
│           │       │   │   │   ├── ms_deform_attn.h
│           │       │   │   │   └── vision.cpp
│           │       │   │   ├── feature_extraction_deformable_detr.py
│           │       │   │   ├── load_custom.py
│           │       │   │   └── modeling_deformable_detr.py
│           │       │   ├── deit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_deit.py
│           │       │   │   ├── convert_deit_timm_to_pytorch.py
│           │       │   │   ├── feature_extraction_deit.py
│           │       │   │   ├── modeling_deit.py
│           │       │   │   └── modeling_tf_deit.py
│           │       │   ├── detr/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_detr.py
│           │       │   │   ├── convert_detr_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── feature_extraction_detr.py
│           │       │   │   └── modeling_detr.py
│           │       │   ├── dialogpt/
│           │       │   │   ├── __init__.py
│           │       │   │   └── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
│           │       │   ├── distilbert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_distilbert.py
│           │       │   │   ├── modeling_distilbert.py
│           │       │   │   ├── modeling_flax_distilbert.py
│           │       │   │   ├── modeling_tf_distilbert.py
│           │       │   │   ├── tokenization_distilbert.py
│           │       │   │   └── tokenization_distilbert_fast.py
│           │       │   ├── dit/
│           │       │   │   ├── __init__.py
│           │       │   │   └── convert_dit_unilm_to_pytorch.py
│           │       │   ├── donut/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_donut_swin.py
│           │       │   │   ├── convert_donut_to_pytorch.py
│           │       │   │   ├── feature_extraction_donut.py
│           │       │   │   ├── modeling_donut_swin.py
│           │       │   │   └── processing_donut.py
│           │       │   ├── dpr/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_dpr.py
│           │       │   │   ├── convert_dpr_original_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_dpr.py
│           │       │   │   ├── modeling_tf_dpr.py
│           │       │   │   ├── tokenization_dpr.py
│           │       │   │   └── tokenization_dpr_fast.py
│           │       │   ├── dpt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_dpt.py
│           │       │   │   ├── convert_dpt_to_pytorch.py
│           │       │   │   ├── feature_extraction_dpt.py
│           │       │   │   └── modeling_dpt.py
│           │       │   ├── electra/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_electra.py
│           │       │   │   ├── convert_electra_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_electra.py
│           │       │   │   ├── modeling_flax_electra.py
│           │       │   │   ├── modeling_tf_electra.py
│           │       │   │   ├── tokenization_electra.py
│           │       │   │   └── tokenization_electra_fast.py
│           │       │   ├── encoder_decoder/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_encoder_decoder.py
│           │       │   │   ├── modeling_encoder_decoder.py
│           │       │   │   ├── modeling_flax_encoder_decoder.py
│           │       │   │   └── modeling_tf_encoder_decoder.py
│           │       │   ├── ernie/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_ernie.py
│           │       │   │   └── modeling_ernie.py
│           │       │   ├── esm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_esm.py
│           │       │   │   ├── convert_esm.py
│           │       │   │   ├── modeling_esm.py
│           │       │   │   ├── modeling_esmfold.py
│           │       │   │   ├── modeling_tf_esm.py
│           │       │   │   ├── openfold_utils/
│           │       │   │   │   ├── __init__.py
│           │       │   │   │   ├── chunk_utils.py
│           │       │   │   │   ├── data_transforms.py
│           │       │   │   │   ├── feats.py
│           │       │   │   │   ├── loss.py
│           │       │   │   │   ├── protein.py
│           │       │   │   │   ├── residue_constants.py
│           │       │   │   │   ├── rigid_utils.py
│           │       │   │   │   └── tensor_utils.py
│           │       │   │   └── tokenization_esm.py
│           │       │   ├── flaubert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_flaubert.py
│           │       │   │   ├── modeling_flaubert.py
│           │       │   │   ├── modeling_tf_flaubert.py
│           │       │   │   └── tokenization_flaubert.py
│           │       │   ├── flava/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_flava.py
│           │       │   │   ├── convert_dalle_to_flava_codebook.py
│           │       │   │   ├── convert_flava_original_pytorch_to_hf.py
│           │       │   │   ├── feature_extraction_flava.py
│           │       │   │   ├── modeling_flava.py
│           │       │   │   └── processing_flava.py
│           │       │   ├── fnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_fnet.py
│           │       │   │   ├── convert_fnet_original_flax_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_fnet.py
│           │       │   │   ├── tokenization_fnet.py
│           │       │   │   └── tokenization_fnet_fast.py
│           │       │   ├── fsmt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_fsmt.py
│           │       │   │   ├── convert_fsmt_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_fsmt.py
│           │       │   │   └── tokenization_fsmt.py
│           │       │   ├── funnel/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_funnel.py
│           │       │   │   ├── convert_funnel_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_funnel.py
│           │       │   │   ├── modeling_tf_funnel.py
│           │       │   │   ├── tokenization_funnel.py
│           │       │   │   └── tokenization_funnel_fast.py
│           │       │   ├── glpn/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_glpn.py
│           │       │   │   ├── convert_glpn_to_pytorch.py
│           │       │   │   ├── feature_extraction_glpn.py
│           │       │   │   ├── image_processing_glpn.py
│           │       │   │   └── modeling_glpn.py
│           │       │   ├── gpt2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_gpt2.py
│           │       │   │   ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_flax_gpt2.py
│           │       │   │   ├── modeling_gpt2.py
│           │       │   │   ├── modeling_tf_gpt2.py
│           │       │   │   ├── tokenization_gpt2.py
│           │       │   │   └── tokenization_gpt2_fast.py
│           │       │   ├── gpt_neo/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_gpt_neo.py
│           │       │   │   ├── convert_gpt_neo_mesh_tf_to_pytorch.py
│           │       │   │   ├── modeling_flax_gpt_neo.py
│           │       │   │   └── modeling_gpt_neo.py
│           │       │   ├── gpt_neox/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_gpt_neox.py
│           │       │   │   ├── modeling_gpt_neox.py
│           │       │   │   └── tokenization_gpt_neox_fast.py
│           │       │   ├── gpt_neox_japanese/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_gpt_neox_japanese.py
│           │       │   │   ├── modeling_gpt_neox_japanese.py
│           │       │   │   └── tokenization_gpt_neox_japanese.py
│           │       │   ├── gptj/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_gptj.py
│           │       │   │   ├── modeling_flax_gptj.py
│           │       │   │   ├── modeling_gptj.py
│           │       │   │   └── modeling_tf_gptj.py
│           │       │   ├── groupvit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_groupvit.py
│           │       │   │   ├── convert_groupvit_nvlab_to_hf.py
│           │       │   │   ├── modeling_groupvit.py
│           │       │   │   └── modeling_tf_groupvit.py
│           │       │   ├── herbert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── tokenization_herbert.py
│           │       │   │   └── tokenization_herbert_fast.py
│           │       │   ├── hubert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_hubert.py
│           │       │   │   ├── convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_hubert_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_hubert_original_s3prl_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_hubert.py
│           │       │   │   └── modeling_tf_hubert.py
│           │       │   ├── ibert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_ibert.py
│           │       │   │   ├── modeling_ibert.py
│           │       │   │   └── quant_modules.py
│           │       │   ├── imagegpt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_imagegpt.py
│           │       │   │   ├── convert_imagegpt_original_tf2_to_pytorch.py
│           │       │   │   ├── feature_extraction_imagegpt.py
│           │       │   │   └── modeling_imagegpt.py
│           │       │   ├── layoutlm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_layoutlm.py
│           │       │   │   ├── modeling_layoutlm.py
│           │       │   │   ├── modeling_tf_layoutlm.py
│           │       │   │   ├── tokenization_layoutlm.py
│           │       │   │   └── tokenization_layoutlm_fast.py
│           │       │   ├── layoutlmv2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_layoutlmv2.py
│           │       │   │   ├── feature_extraction_layoutlmv2.py
│           │       │   │   ├── modeling_layoutlmv2.py
│           │       │   │   ├── processing_layoutlmv2.py
│           │       │   │   ├── tokenization_layoutlmv2.py
│           │       │   │   └── tokenization_layoutlmv2_fast.py
│           │       │   ├── layoutlmv3/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_layoutlmv3.py
│           │       │   │   ├── feature_extraction_layoutlmv3.py
│           │       │   │   ├── modeling_layoutlmv3.py
│           │       │   │   ├── modeling_tf_layoutlmv3.py
│           │       │   │   ├── processing_layoutlmv3.py
│           │       │   │   ├── tokenization_layoutlmv3.py
│           │       │   │   └── tokenization_layoutlmv3_fast.py
│           │       │   ├── layoutxlm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── processing_layoutxlm.py
│           │       │   │   ├── tokenization_layoutxlm.py
│           │       │   │   └── tokenization_layoutxlm_fast.py
│           │       │   ├── led/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_led.py
│           │       │   │   ├── modeling_led.py
│           │       │   │   ├── modeling_tf_led.py
│           │       │   │   ├── tokenization_led.py
│           │       │   │   └── tokenization_led_fast.py
│           │       │   ├── levit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_levit.py
│           │       │   │   ├── convert_levit_timm_to_pytorch.py
│           │       │   │   ├── feature_extraction_levit.py
│           │       │   │   └── modeling_levit.py
│           │       │   ├── lilt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_lilt.py
│           │       │   │   └── modeling_lilt.py
│           │       │   ├── longformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_longformer.py
│           │       │   │   ├── convert_longformer_original_pytorch_lightning_to_pytorch.py
│           │       │   │   ├── modeling_longformer.py
│           │       │   │   ├── modeling_tf_longformer.py
│           │       │   │   ├── tokenization_longformer.py
│           │       │   │   └── tokenization_longformer_fast.py
│           │       │   ├── longt5/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_longt5.py
│           │       │   │   ├── convert_longt5x_checkpoint_to_flax.py
│           │       │   │   ├── modeling_flax_longt5.py
│           │       │   │   └── modeling_longt5.py
│           │       │   ├── luke/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_luke.py
│           │       │   │   ├── convert_luke_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_luke.py
│           │       │   │   └── tokenization_luke.py
│           │       │   ├── lxmert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_lxmert.py
│           │       │   │   ├── convert_lxmert_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_lxmert.py
│           │       │   │   ├── modeling_tf_lxmert.py
│           │       │   │   ├── tokenization_lxmert.py
│           │       │   │   └── tokenization_lxmert_fast.py
│           │       │   ├── m2m_100/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_m2m_100.py
│           │       │   │   ├── convert_m2m100_original_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_m2m_100.py
│           │       │   │   └── tokenization_m2m_100.py
│           │       │   ├── marian/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_marian.py
│           │       │   │   ├── convert_marian_tatoeba_to_pytorch.py
│           │       │   │   ├── convert_marian_to_pytorch.py
│           │       │   │   ├── modeling_flax_marian.py
│           │       │   │   ├── modeling_marian.py
│           │       │   │   ├── modeling_tf_marian.py
│           │       │   │   └── tokenization_marian.py
│           │       │   ├── markuplm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_markuplm.py
│           │       │   │   ├── feature_extraction_markuplm.py
│           │       │   │   ├── modeling_markuplm.py
│           │       │   │   ├── processing_markuplm.py
│           │       │   │   ├── tokenization_markuplm.py
│           │       │   │   └── tokenization_markuplm_fast.py
│           │       │   ├── maskformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_maskformer.py
│           │       │   │   ├── convert_maskformer_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── feature_extraction_maskformer.py
│           │       │   │   └── modeling_maskformer.py
│           │       │   ├── mbart/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mbart.py
│           │       │   │   ├── convert_mbart_original_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_flax_mbart.py
│           │       │   │   ├── modeling_mbart.py
│           │       │   │   ├── modeling_tf_mbart.py
│           │       │   │   ├── tokenization_mbart.py
│           │       │   │   └── tokenization_mbart_fast.py
│           │       │   ├── mbart50/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── tokenization_mbart50.py
│           │       │   │   └── tokenization_mbart50_fast.py
│           │       │   ├── mctct/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mctct.py
│           │       │   │   ├── feature_extraction_mctct.py
│           │       │   │   ├── modeling_mctct.py
│           │       │   │   └── processing_mctct.py
│           │       │   ├── megatron_bert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_megatron_bert.py
│           │       │   │   ├── convert_megatron_bert_checkpoint.py
│           │       │   │   └── modeling_megatron_bert.py
│           │       │   ├── megatron_gpt2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── checkpoint_reshaping_and_interoperability.py
│           │       │   │   └── convert_megatron_gpt2_checkpoint.py
│           │       │   ├── mluke/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── convert_mluke_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── tokenization_mluke.py
│           │       │   ├── mmbt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mmbt.py
│           │       │   │   └── modeling_mmbt.py
│           │       │   ├── mobilebert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mobilebert.py
│           │       │   │   ├── convert_mobilebert_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_mobilebert.py
│           │       │   │   ├── modeling_tf_mobilebert.py
│           │       │   │   ├── tokenization_mobilebert.py
│           │       │   │   └── tokenization_mobilebert_fast.py
│           │       │   ├── mobilevit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mobilevit.py
│           │       │   │   ├── convert_mlcvnets_to_pytorch.py
│           │       │   │   ├── feature_extraction_mobilevit.py
│           │       │   │   ├── modeling_mobilevit.py
│           │       │   │   └── modeling_tf_mobilevit.py
│           │       │   ├── mpnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mpnet.py
│           │       │   │   ├── modeling_mpnet.py
│           │       │   │   ├── modeling_tf_mpnet.py
│           │       │   │   ├── tokenization_mpnet.py
│           │       │   │   └── tokenization_mpnet_fast.py
│           │       │   ├── mt5/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mt5.py
│           │       │   │   ├── modeling_flax_mt5.py
│           │       │   │   ├── modeling_mt5.py
│           │       │   │   └── modeling_tf_mt5.py
│           │       │   ├── mvp/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_mvp.py
│           │       │   │   ├── modeling_mvp.py
│           │       │   │   ├── tokenization_mvp.py
│           │       │   │   └── tokenization_mvp_fast.py
│           │       │   ├── nezha/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_nezha.py
│           │       │   │   └── modeling_nezha.py
│           │       │   ├── nllb/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── tokenization_nllb.py
│           │       │   │   └── tokenization_nllb_fast.py
│           │       │   ├── nystromformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_nystromformer.py
│           │       │   │   ├── convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_nystromformer.py
│           │       │   ├── openai/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_openai.py
│           │       │   │   ├── convert_openai_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_openai.py
│           │       │   │   ├── modeling_tf_openai.py
│           │       │   │   ├── tokenization_openai.py
│           │       │   │   └── tokenization_openai_fast.py
│           │       │   ├── opt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_opt.py
│           │       │   │   ├── convert_opt_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_flax_opt.py
│           │       │   │   ├── modeling_opt.py
│           │       │   │   └── modeling_tf_opt.py
│           │       │   ├── owlvit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_owlvit.py
│           │       │   │   ├── convert_owlvit_original_flax_to_hf.py
│           │       │   │   ├── feature_extraction_owlvit.py
│           │       │   │   ├── modeling_owlvit.py
│           │       │   │   └── processing_owlvit.py
│           │       │   ├── pegasus/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_pegasus.py
│           │       │   │   ├── convert_pegasus_tf_to_pytorch.py
│           │       │   │   ├── modeling_flax_pegasus.py
│           │       │   │   ├── modeling_pegasus.py
│           │       │   │   ├── modeling_tf_pegasus.py
│           │       │   │   ├── tokenization_pegasus.py
│           │       │   │   └── tokenization_pegasus_fast.py
│           │       │   ├── pegasus_x/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_pegasus_x.py
│           │       │   │   └── modeling_pegasus_x.py
│           │       │   ├── perceiver/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_perceiver.py
│           │       │   │   ├── convert_perceiver_haiku_to_pytorch.py
│           │       │   │   ├── feature_extraction_perceiver.py
│           │       │   │   ├── modeling_perceiver.py
│           │       │   │   └── tokenization_perceiver.py
│           │       │   ├── phobert/
│           │       │   │   ├── __init__.py
│           │       │   │   └── tokenization_phobert.py
│           │       │   ├── plbart/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_plbart.py
│           │       │   │   ├── convert_plbart_original_checkpoint_to_torch.py
│           │       │   │   ├── modeling_plbart.py
│           │       │   │   └── tokenization_plbart.py
│           │       │   ├── poolformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_poolformer.py
│           │       │   │   ├── convert_poolformer_original_to_pytorch.py
│           │       │   │   ├── feature_extraction_poolformer.py
│           │       │   │   └── modeling_poolformer.py
│           │       │   ├── prophetnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_prophetnet.py
│           │       │   │   ├── convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_prophetnet.py
│           │       │   │   └── tokenization_prophetnet.py
│           │       │   ├── qdqbert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_qdqbert.py
│           │       │   │   └── modeling_qdqbert.py
│           │       │   ├── rag/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_rag.py
│           │       │   │   ├── modeling_rag.py
│           │       │   │   ├── modeling_tf_rag.py
│           │       │   │   ├── retrieval_rag.py
│           │       │   │   └── tokenization_rag.py
│           │       │   ├── realm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_realm.py
│           │       │   │   ├── modeling_realm.py
│           │       │   │   ├── retrieval_realm.py
│           │       │   │   ├── tokenization_realm.py
│           │       │   │   └── tokenization_realm_fast.py
│           │       │   ├── reformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_reformer.py
│           │       │   │   ├── convert_reformer_trax_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_reformer.py
│           │       │   │   ├── tokenization_reformer.py
│           │       │   │   └── tokenization_reformer_fast.py
│           │       │   ├── regnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_regnet.py
│           │       │   │   ├── convert_regnet_seer_10b_to_pytorch.py
│           │       │   │   ├── convert_regnet_to_pytorch.py
│           │       │   │   ├── modeling_regnet.py
│           │       │   │   └── modeling_tf_regnet.py
│           │       │   ├── rembert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_rembert.py
│           │       │   │   ├── convert_rembert_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_rembert.py
│           │       │   │   ├── modeling_tf_rembert.py
│           │       │   │   ├── tokenization_rembert.py
│           │       │   │   └── tokenization_rembert_fast.py
│           │       │   ├── resnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_resnet.py
│           │       │   │   ├── convert_resnet_to_pytorch.py
│           │       │   │   ├── modeling_resnet.py
│           │       │   │   └── modeling_tf_resnet.py
│           │       │   ├── retribert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_retribert.py
│           │       │   │   ├── modeling_retribert.py
│           │       │   │   ├── tokenization_retribert.py
│           │       │   │   └── tokenization_retribert_fast.py
│           │       │   ├── roberta/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_roberta.py
│           │       │   │   ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_flax_roberta.py
│           │       │   │   ├── modeling_roberta.py
│           │       │   │   ├── modeling_tf_roberta.py
│           │       │   │   ├── tokenization_roberta.py
│           │       │   │   └── tokenization_roberta_fast.py
│           │       │   ├── roformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_roformer.py
│           │       │   │   ├── convert_roformer_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_flax_roformer.py
│           │       │   │   ├── modeling_roformer.py
│           │       │   │   ├── modeling_tf_roformer.py
│           │       │   │   ├── tokenization_roformer.py
│           │       │   │   ├── tokenization_roformer_fast.py
│           │       │   │   └── tokenization_utils.py
│           │       │   ├── segformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_segformer.py
│           │       │   │   ├── convert_segformer_original_to_pytorch.py
│           │       │   │   ├── feature_extraction_segformer.py
│           │       │   │   ├── modeling_segformer.py
│           │       │   │   └── modeling_tf_segformer.py
│           │       │   ├── sew/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_sew.py
│           │       │   │   ├── convert_sew_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_sew.py
│           │       │   ├── sew_d/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_sew_d.py
│           │       │   │   ├── convert_sew_d_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_sew_d.py
│           │       │   ├── speech_encoder_decoder/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_speech_encoder_decoder.py
│           │       │   │   ├── convert_mbart_wav2vec2_seq2seq_original_to_pytorch.py
│           │       │   │   ├── convert_speech_to_text_wav2vec2_seq2seq_original_to_pytorch.py
│           │       │   │   ├── modeling_flax_speech_encoder_decoder.py
│           │       │   │   └── modeling_speech_encoder_decoder.py
│           │       │   ├── speech_to_text/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_speech_to_text.py
│           │       │   │   ├── convert_s2t_fairseq_to_tfms.py
│           │       │   │   ├── feature_extraction_speech_to_text.py
│           │       │   │   ├── modeling_speech_to_text.py
│           │       │   │   ├── modeling_tf_speech_to_text.py
│           │       │   │   ├── processing_speech_to_text.py
│           │       │   │   └── tokenization_speech_to_text.py
│           │       │   ├── speech_to_text_2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_speech_to_text_2.py
│           │       │   │   ├── modeling_speech_to_text_2.py
│           │       │   │   ├── processing_speech_to_text_2.py
│           │       │   │   └── tokenization_speech_to_text_2.py
│           │       │   ├── splinter/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_splinter.py
│           │       │   │   ├── modeling_splinter.py
│           │       │   │   ├── tokenization_splinter.py
│           │       │   │   └── tokenization_splinter_fast.py
│           │       │   ├── squeezebert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_squeezebert.py
│           │       │   │   ├── modeling_squeezebert.py
│           │       │   │   ├── tokenization_squeezebert.py
│           │       │   │   └── tokenization_squeezebert_fast.py
│           │       │   ├── swin/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_swin.py
│           │       │   │   ├── convert_swin_timm_to_pytorch.py
│           │       │   │   ├── modeling_swin.py
│           │       │   │   └── modeling_tf_swin.py
│           │       │   ├── swinv2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_swinv2.py
│           │       │   │   ├── convert_swinv2_timm_to_pytorch.py
│           │       │   │   └── modeling_swinv2.py
│           │       │   ├── t5/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_t5.py
│           │       │   │   ├── convert_t5_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_t5x_checkpoint_to_flax.py
│           │       │   │   ├── download_from_gcp.sh
│           │       │   │   ├── modeling_flax_t5.py
│           │       │   │   ├── modeling_t5.py
│           │       │   │   ├── modeling_tf_t5.py
│           │       │   │   ├── tokenization_t5.py
│           │       │   │   └── tokenization_t5_fast.py
│           │       │   ├── table_transformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_table_transformer.py
│           │       │   │   ├── convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_table_transformer.py
│           │       │   ├── tapas/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_tapas.py
│           │       │   │   ├── convert_tapas_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_tapas.py
│           │       │   │   ├── modeling_tf_tapas.py
│           │       │   │   └── tokenization_tapas.py
│           │       │   ├── tapex/
│           │       │   │   ├── __init__.py
│           │       │   │   └── tokenization_tapex.py
│           │       │   ├── time_series_transformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_time_series_transformer.py
│           │       │   │   └── modeling_time_series_transformer.py
│           │       │   ├── trajectory_transformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_trajectory_transformer.py
│           │       │   │   ├── convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_trajectory_transformer.py
│           │       │   ├── transfo_xl/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_transfo_xl.py
│           │       │   │   ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_tf_transfo_xl.py
│           │       │   │   ├── modeling_tf_transfo_xl_utilities.py
│           │       │   │   ├── modeling_transfo_xl.py
│           │       │   │   ├── modeling_transfo_xl_utilities.py
│           │       │   │   └── tokenization_transfo_xl.py
│           │       │   ├── trocr/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_trocr.py
│           │       │   │   ├── convert_trocr_unilm_to_pytorch.py
│           │       │   │   ├── modeling_trocr.py
│           │       │   │   └── processing_trocr.py
│           │       │   ├── unispeech/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_unispeech.py
│           │       │   │   ├── convert_unispeech_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_unispeech.py
│           │       │   ├── unispeech_sat/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_unispeech_sat.py
│           │       │   │   ├── convert_unispeech_original_s3prl_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_unispeech_sat.py
│           │       │   ├── van/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_van.py
│           │       │   │   ├── convert_van_to_pytorch.py
│           │       │   │   └── modeling_van.py
│           │       │   ├── videomae/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_videomae.py
│           │       │   │   ├── convert_videomae_to_pytorch.py
│           │       │   │   ├── feature_extraction_videomae.py
│           │       │   │   └── modeling_videomae.py
│           │       │   ├── vilt/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_vilt.py
│           │       │   │   ├── convert_vilt_original_to_pytorch.py
│           │       │   │   ├── feature_extraction_vilt.py
│           │       │   │   ├── modeling_vilt.py
│           │       │   │   └── processing_vilt.py
│           │       │   ├── vision_encoder_decoder/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_vision_encoder_decoder.py
│           │       │   │   ├── modeling_flax_vision_encoder_decoder.py
│           │       │   │   ├── modeling_tf_vision_encoder_decoder.py
│           │       │   │   └── modeling_vision_encoder_decoder.py
│           │       │   ├── vision_text_dual_encoder/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_vision_text_dual_encoder.py
│           │       │   │   ├── modeling_flax_vision_text_dual_encoder.py
│           │       │   │   ├── modeling_vision_text_dual_encoder.py
│           │       │   │   └── processing_vision_text_dual_encoder.py
│           │       │   ├── visual_bert/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_visual_bert.py
│           │       │   │   ├── convert_visual_bert_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_visual_bert.py
│           │       │   ├── vit/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_vit.py
│           │       │   │   ├── convert_dino_to_pytorch.py
│           │       │   │   ├── convert_vit_timm_to_pytorch.py
│           │       │   │   ├── feature_extraction_vit.py
│           │       │   │   ├── modeling_flax_vit.py
│           │       │   │   ├── modeling_tf_vit.py
│           │       │   │   └── modeling_vit.py
│           │       │   ├── vit_mae/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_vit_mae.py
│           │       │   │   ├── convert_vit_mae_to_pytorch.py
│           │       │   │   ├── modeling_tf_vit_mae.py
│           │       │   │   └── modeling_vit_mae.py
│           │       │   ├── vit_msn/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_vit_msn.py
│           │       │   │   ├── convert_msn_to_pytorch.py
│           │       │   │   └── modeling_vit_msn.py
│           │       │   ├── wav2vec2/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_wav2vec2.py
│           │       │   │   ├── convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py
│           │       │   │   ├── feature_extraction_wav2vec2.py
│           │       │   │   ├── modeling_flax_wav2vec2.py
│           │       │   │   ├── modeling_tf_wav2vec2.py
│           │       │   │   ├── modeling_wav2vec2.py
│           │       │   │   ├── processing_wav2vec2.py
│           │       │   │   └── tokenization_wav2vec2.py
│           │       │   ├── wav2vec2_conformer/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_wav2vec2_conformer.py
│           │       │   │   ├── convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_wav2vec2_conformer.py
│           │       │   ├── wav2vec2_phoneme/
│           │       │   │   ├── __init__.py
│           │       │   │   └── tokenization_wav2vec2_phoneme.py
│           │       │   ├── wav2vec2_with_lm/
│           │       │   │   ├── __init__.py
│           │       │   │   └── processing_wav2vec2_with_lm.py
│           │       │   ├── wavlm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_wavlm.py
│           │       │   │   ├── convert_wavlm_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── convert_wavlm_original_s3prl_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_wavlm.py
│           │       │   ├── whisper/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_whisper.py
│           │       │   │   ├── english_normalizer.py
│           │       │   │   ├── feature_extraction_whisper.py
│           │       │   │   ├── modeling_tf_whisper.py
│           │       │   │   ├── modeling_whisper.py
│           │       │   │   ├── processing_whisper.py
│           │       │   │   └── tokenization_whisper.py
│           │       │   ├── x_clip/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_x_clip.py
│           │       │   │   ├── convert_x_clip_original_pytorch_to_hf.py
│           │       │   │   ├── modeling_x_clip.py
│           │       │   │   └── processing_x_clip.py
│           │       │   ├── xglm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_xglm.py
│           │       │   │   ├── convert_xglm_original_ckpt_to_trfms.py
│           │       │   │   ├── modeling_flax_xglm.py
│           │       │   │   ├── modeling_tf_xglm.py
│           │       │   │   ├── modeling_xglm.py
│           │       │   │   ├── tokenization_xglm.py
│           │       │   │   └── tokenization_xglm_fast.py
│           │       │   ├── xlm/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_xlm.py
│           │       │   │   ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_tf_xlm.py
│           │       │   │   ├── modeling_xlm.py
│           │       │   │   └── tokenization_xlm.py
│           │       │   ├── xlm_prophetnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_xlm_prophetnet.py
│           │       │   │   ├── modeling_xlm_prophetnet.py
│           │       │   │   └── tokenization_xlm_prophetnet.py
│           │       │   ├── xlm_roberta/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_xlm_roberta.py
│           │       │   │   ├── modeling_flax_xlm_roberta.py
│           │       │   │   ├── modeling_tf_xlm_roberta.py
│           │       │   │   ├── modeling_xlm_roberta.py
│           │       │   │   ├── tokenization_xlm_roberta.py
│           │       │   │   └── tokenization_xlm_roberta_fast.py
│           │       │   ├── xlm_roberta_xl/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_xlm_roberta_xl.py
│           │       │   │   ├── convert_xlm_roberta_xl_original_pytorch_checkpoint_to_pytorch.py
│           │       │   │   └── modeling_xlm_roberta_xl.py
│           │       │   ├── xlnet/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_xlnet.py
│           │       │   │   ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py
│           │       │   │   ├── modeling_tf_xlnet.py
│           │       │   │   ├── modeling_xlnet.py
│           │       │   │   ├── tokenization_xlnet.py
│           │       │   │   └── tokenization_xlnet_fast.py
│           │       │   ├── yolos/
│           │       │   │   ├── __init__.py
│           │       │   │   ├── configuration_yolos.py
│           │       │   │   ├── convert_yolos_to_pytorch.py
│           │       │   │   ├── feature_extraction_yolos.py
│           │       │   │   └── modeling_yolos.py
│           │       │   └── yoso/
│           │       │       ├── __init__.py
│           │       │       ├── common.h
│           │       │       ├── common_cuda.h
│           │       │       ├── common_cuda_device.h
│           │       │       ├── configuration_yoso.py
│           │       │       ├── convert_yoso_pytorch_to_pytorch.py
│           │       │       ├── fast_lsh_cumulation.cu
│           │       │       ├── fast_lsh_cumulation.h
│           │       │       ├── fast_lsh_cumulation_cuda.cu
│           │       │       ├── fast_lsh_cumulation_cuda.h
│           │       │       ├── fast_lsh_cumulation_torch.cpp
│           │       │       └── modeling_yoso.py
│           │       ├── onnx/
│           │       │   ├── __init__.py
│           │       │   ├── __main__.py
│           │       │   ├── config.py
│           │       │   ├── convert.py
│           │       │   ├── features.py
│           │       │   └── utils.py
│           │       ├── optimization.py
│           │       ├── optimization_tf.py
│           │       ├── pipelines/
│           │       │   ├── __init__.py
│           │       │   ├── audio_classification.py
│           │       │   ├── audio_utils.py
│           │       │   ├── automatic_speech_recognition.py
│           │       │   ├── base.py
│           │       │   ├── conversational.py
│           │       │   ├── depth_estimation.py
│           │       │   ├── document_question_answering.py
│           │       │   ├── feature_extraction.py
│           │       │   ├── fill_mask.py
│           │       │   ├── image_classification.py
│           │       │   ├── image_segmentation.py
│           │       │   ├── image_to_text.py
│           │       │   ├── object_detection.py
│           │       │   ├── pt_utils.py
│           │       │   ├── question_answering.py
│           │       │   ├── table_question_answering.py
│           │       │   ├── text2text_generation.py
│           │       │   ├── text_classification.py
│           │       │   ├── text_generation.py
│           │       │   ├── token_classification.py
│           │       │   ├── visual_question_answering.py
│           │       │   ├── zero_shot_classification.py
│           │       │   ├── zero_shot_image_classification.py
│           │       │   └── zero_shot_object_detection.py
│           │       ├── processing_utils.py
│           │       ├── pytorch_utils.py
│           │       ├── sagemaker/
│           │       │   ├── __init__.py
│           │       │   ├── trainer_sm.py
│           │       │   └── training_args_sm.py
│           │       ├── testing_utils.py
│           │       ├── tf_utils.py
│           │       ├── tokenization_utils.py
│           │       ├── tokenization_utils_base.py
│           │       ├── tokenization_utils_fast.py
│           │       ├── trainer.py
│           │       ├── trainer_callback.py
│           │       ├── trainer_pt_utils.py
│           │       ├── trainer_seq2seq.py
│           │       ├── trainer_tf.py
│           │       ├── trainer_utils.py
│           │       ├── training_args.py
│           │       ├── training_args_seq2seq.py
│           │       ├── training_args_tf.py
│           │       └── utils/
│           │           ├── __init__.py
│           │           ├── bitsandbytes.py
│           │           ├── constants.py
│           │           ├── doc.py
│           │           ├── dummy_detectron2_objects.py
│           │           ├── dummy_flax_objects.py
│           │           ├── dummy_pt_objects.py
│           │           ├── dummy_scatter_objects.py
│           │           ├── dummy_sentencepiece_and_speech_objects.py
│           │           ├── dummy_sentencepiece_and_tokenizers_objects.py
│           │           ├── dummy_sentencepiece_objects.py
│           │           ├── dummy_speech_objects.py
│           │           ├── dummy_tensorflow_text_objects.py
│           │           ├── dummy_tf_objects.py
│           │           ├── dummy_timm_and_vision_objects.py
│           │           ├── dummy_tokenizers_objects.py
│           │           ├── dummy_vision_objects.py
│           │           ├── fx.py
│           │           ├── generic.py
│           │           ├── hp_naming.py
│           │           ├── hub.py
│           │           ├── import_utils.py
│           │           ├── logging.py
│           │           ├── model_parallel_utils.py
│           │           ├── notebook.py
│           │           ├── sentencepiece_model_pb2.py
│           │           └── versions.py
│           ├── templates/
│           │   ├── adding_a_missing_tokenization_test/
│           │   │   ├── README.md
│           │   │   ├── cookiecutter-template-{{cookiecutter.modelname}}/
│           │   │   │   └── test_tokenization_{{cookiecutter.lowercase_modelname}}.py
│           │   │   └── cookiecutter.json
│           │   ├── adding_a_new_example_script/
│           │   │   ├── README.md
│           │   │   ├── cookiecutter.json
│           │   │   └── {{cookiecutter.directory_name}}/
│           │   │       └── run_{{cookiecutter.example_shortcut}}.py
│           │   └── adding_a_new_model/
│           │       ├── ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md
│           │       ├── README.md
│           │       ├── cookiecutter-template-{{cookiecutter.modelname}}/
│           │       │   ├── __init__.py
│           │       │   ├── configuration.json
│           │       │   ├── configuration_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── modeling_flax_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── modeling_tf_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── modeling_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── test_modeling_flax_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── test_modeling_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── to_replace_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
│           │       │   ├── tokenization_{{cookiecutter.lowercase_modelname}}.py
│           │       │   └── {{cookiecutter.lowercase_modelname}}.mdx
│           │       ├── cookiecutter.json
│           │       ├── open_model_proposals/
│           │       │   ├── ADD_BIG_BIRD.md
│           │       │   └── README.md
│           │       └── tests/
│           │           ├── encoder-bert-tokenizer.json
│           │           ├── flax-encoder-bert-tokenizer.json
│           │           ├── flax-seq-2-seq-bart-tokenizer.json
│           │           ├── pt-encoder-bert-tokenizer.json
│           │           ├── pt-seq-2-seq-bart-tokenizer.json
│           │           ├── standalone.json
│           │           ├── tf-encoder-bert-tokenizer.json
│           │           └── tf-seq-2-seq-bart-tokenizer.json
│           ├── tests/
│           │   ├── __init__.py
│           │   ├── benchmark/
│           │   │   ├── __init__.py
│           │   │   ├── test_benchmark.py
│           │   │   └── test_benchmark_tf.py
│           │   ├── deepspeed/
│           │   │   ├── ds_config_zero2.json
│           │   │   ├── ds_config_zero3.json
│           │   │   ├── test_deepspeed.py
│           │   │   ├── test_model_zoo.py
│           │   │   └── vit_feature_extractor.json
│           │   ├── extended/
│           │   │   └── test_trainer_ext.py
│           │   ├── fixtures/
│           │   │   ├── add_distilbert_like_config.json
│           │   │   ├── dummy-config.json
│           │   │   ├── dummy_feature_extractor_config.json
│           │   │   ├── empty.txt
│           │   │   ├── input.txt
│           │   │   ├── merges.txt
│           │   │   ├── preprocessor_config.json
│           │   │   ├── sample_text.txt
│           │   │   ├── sample_text_no_unicode.txt
│           │   │   ├── spiece.model
│           │   │   ├── test_entity_vocab.json
│           │   │   ├── test_sentencepiece.model
│           │   │   ├── test_sentencepiece_bpe.model
│           │   │   ├── test_sentencepiece_no_bos.model
│           │   │   ├── test_sentencepiece_with_bytefallback.model
│           │   │   ├── tests_samples/
│           │   │   │   ├── .gitignore
│           │   │   │   ├── COCO/
│           │   │   │   │   ├── coco_annotations.txt
│           │   │   │   │   └── coco_panoptic_annotations.txt
│           │   │   │   ├── GermEval/
│           │   │   │   │   ├── dev.txt
│           │   │   │   │   ├── labels.txt
│           │   │   │   │   └── train.txt
│           │   │   │   ├── MRPC/
│           │   │   │   │   ├── dev.csv
│           │   │   │   │   ├── dev.tsv
│           │   │   │   │   ├── train.csv
│           │   │   │   │   └── train.tsv
│           │   │   │   ├── SQUAD/
│           │   │   │   │   └── sample.json
│           │   │   │   ├── STS-B/
│           │   │   │   │   ├── dev.tsv
│           │   │   │   │   └── train.tsv
│           │   │   │   ├── conll/
│           │   │   │   │   └── sample.json
│           │   │   │   ├── swag/
│           │   │   │   │   └── sample.json
│           │   │   │   ├── wiki_text/
│           │   │   │   │   └── wiki_00
│           │   │   │   ├── wmt16/
│           │   │   │   │   └── sample.json
│           │   │   │   ├── wmt_en_ro/
│           │   │   │   │   ├── test.json
│           │   │   │   │   ├── train.json
│           │   │   │   │   └── val.json
│           │   │   │   └── xsum/
│           │   │   │       └── sample.json
│           │   │   ├── vocab.json
│           │   │   └── vocab.txt
│           │   ├── generation/
│           │   │   ├── __init__.py
│           │   │   ├── test_generation_beam_constraints.py
│           │   │   ├── test_generation_beam_search.py
│           │   │   ├── test_generation_flax_logits_process.py
│           │   │   ├── test_generation_flax_utils.py
│           │   │   ├── test_generation_logits_process.py
│           │   │   ├── test_generation_stopping_criteria.py
│           │   │   ├── test_generation_tf_logits_process.py
│           │   │   ├── test_generation_tf_utils.py
│           │   │   └── test_generation_utils.py
│           │   ├── mixed_int8/
│           │   │   ├── README.md
│           │   │   ├── __init__.py
│           │   │   └── test_mixed_int8.py
│           │   ├── models/
│           │   │   ├── __init__.py
│           │   │   ├── albert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_albert.py
│           │   │   │   ├── test_modeling_flax_albert.py
│           │   │   │   ├── test_modeling_tf_albert.py
│           │   │   │   └── test_tokenization_albert.py
│           │   │   ├── auto/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_configuration_auto.py
│           │   │   │   ├── test_feature_extraction_auto.py
│           │   │   │   ├── test_modeling_auto.py
│           │   │   │   ├── test_modeling_flax_auto.py
│           │   │   │   ├── test_modeling_tf_auto.py
│           │   │   │   ├── test_modeling_tf_pytorch.py
│           │   │   │   ├── test_processor_auto.py
│           │   │   │   └── test_tokenization_auto.py
│           │   │   ├── bart/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_bart.py
│           │   │   │   ├── test_modeling_flax_bart.py
│           │   │   │   ├── test_modeling_tf_bart.py
│           │   │   │   └── test_tokenization_bart.py
│           │   │   ├── barthez/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_barthez.py
│           │   │   ├── bartpho/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_bartpho.py
│           │   │   ├── beit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_beit.py
│           │   │   │   ├── test_modeling_beit.py
│           │   │   │   └── test_modeling_flax_beit.py
│           │   │   ├── bert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_bert.py
│           │   │   │   ├── test_modeling_flax_bert.py
│           │   │   │   ├── test_modeling_tf_bert.py
│           │   │   │   ├── test_tokenization_bert.py
│           │   │   │   └── test_tokenization_bert_tf.py
│           │   │   ├── bert_generation/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_bert_generation.py
│           │   │   │   └── test_tokenization_bert_generation.py
│           │   │   ├── bert_japanese/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_bert_japanese.py
│           │   │   ├── bertweet/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_bertweet.py
│           │   │   ├── big_bird/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_big_bird.py
│           │   │   │   ├── test_modeling_flax_big_bird.py
│           │   │   │   └── test_tokenization_big_bird.py
│           │   │   ├── bigbird_pegasus/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_bigbird_pegasus.py
│           │   │   ├── blenderbot/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_blenderbot.py
│           │   │   │   ├── test_modeling_flax_blenderbot.py
│           │   │   │   ├── test_modeling_tf_blenderbot.py
│           │   │   │   └── test_tokenization_blenderbot.py
│           │   │   ├── blenderbot_small/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_blenderbot_small.py
│           │   │   │   ├── test_modeling_flax_blenderbot_small.py
│           │   │   │   ├── test_modeling_tf_blenderbot_small.py
│           │   │   │   └── test_tokenization_blenderbot_small.py
│           │   │   ├── bloom/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_bloom.py
│           │   │   │   └── test_tokenization_bloom.py
│           │   │   ├── bort/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_bort.py
│           │   │   │   └── test_modeling_tf_bort.py
│           │   │   ├── byt5/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_byt5.py
│           │   │   ├── camembert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_camembert.py
│           │   │   │   ├── test_modeling_tf_camembert.py
│           │   │   │   └── test_tokenization_camembert.py
│           │   │   ├── canine/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_canine.py
│           │   │   │   └── test_tokenization_canine.py
│           │   │   ├── clip/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_clip.py
│           │   │   │   ├── test_modeling_clip.py
│           │   │   │   ├── test_modeling_flax_clip.py
│           │   │   │   ├── test_modeling_tf_clip.py
│           │   │   │   ├── test_processor_clip.py
│           │   │   │   └── test_tokenization_clip.py
│           │   │   ├── codegen/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_codegen.py
│           │   │   │   └── test_tokenization_codegen.py
│           │   │   ├── conditional_detr/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_conditional_detr.py
│           │   │   │   └── test_modeling_conditional_detr.py
│           │   │   ├── convbert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_convbert.py
│           │   │   │   └── test_modeling_tf_convbert.py
│           │   │   ├── convnext/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_convnext.py
│           │   │   │   ├── test_modeling_convnext.py
│           │   │   │   └── test_modeling_tf_convnext.py
│           │   │   ├── cpm/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_cpm.py
│           │   │   ├── ctrl/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_ctrl.py
│           │   │   │   ├── test_modeling_tf_ctrl.py
│           │   │   │   └── test_tokenization_ctrl.py
│           │   │   ├── cvt/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_cvt.py
│           │   │   │   └── test_modeling_tf_cvt.py
│           │   │   ├── data2vec/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_data2vec_audio.py
│           │   │   │   ├── test_modeling_data2vec_text.py
│           │   │   │   ├── test_modeling_data2vec_vision.py
│           │   │   │   └── test_modeling_tf_data2vec_vision.py
│           │   │   ├── deberta/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_deberta.py
│           │   │   │   ├── test_modeling_tf_deberta.py
│           │   │   │   └── test_tokenization_deberta.py
│           │   │   ├── deberta_v2/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_deberta_v2.py
│           │   │   │   ├── test_modeling_tf_deberta_v2.py
│           │   │   │   └── test_tokenization_deberta_v2.py
│           │   │   ├── decision_transformer/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_decision_transformer.py
│           │   │   ├── deformable_detr/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_deformable_detr.py
│           │   │   │   └── test_modeling_deformable_detr.py
│           │   │   ├── deit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_deit.py
│           │   │   │   ├── test_modeling_deit.py
│           │   │   │   └── test_modeling_tf_deit.py
│           │   │   ├── detr/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_detr.py
│           │   │   │   └── test_modeling_detr.py
│           │   │   ├── distilbert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_distilbert.py
│           │   │   │   ├── test_modeling_flax_distilbert.py
│           │   │   │   ├── test_modeling_tf_distilbert.py
│           │   │   │   └── test_tokenization_distilbert.py
│           │   │   ├── dit/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_dit.py
│           │   │   ├── donut/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_donut.py
│           │   │   │   └── test_modeling_donut_swin.py
│           │   │   ├── dpr/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_dpr.py
│           │   │   │   ├── test_modeling_tf_dpr.py
│           │   │   │   └── test_tokenization_dpr.py
│           │   │   ├── dpt/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_dpt.py
│           │   │   │   └── test_modeling_dpt.py
│           │   │   ├── electra/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_electra.py
│           │   │   │   ├── test_modeling_flax_electra.py
│           │   │   │   └── test_modeling_tf_electra.py
│           │   │   ├── encoder_decoder/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_encoder_decoder.py
│           │   │   │   ├── test_modeling_flax_encoder_decoder.py
│           │   │   │   └── test_modeling_tf_encoder_decoder.py
│           │   │   ├── ernie/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_ernie.py
│           │   │   ├── esm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_esm.py
│           │   │   │   ├── test_modeling_esmfold.py
│           │   │   │   ├── test_modeling_tf_esm.py
│           │   │   │   └── test_tokenization_esm.py
│           │   │   ├── flaubert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flaubert.py
│           │   │   │   └── test_modeling_tf_flaubert.py
│           │   │   ├── flava/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_flava.py
│           │   │   │   ├── test_modeling_flava.py
│           │   │   │   └── test_processor_flava.py
│           │   │   ├── fnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_fnet.py
│           │   │   │   └── test_tokenization_fnet.py
│           │   │   ├── fsmt/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_fsmt.py
│           │   │   │   └── test_tokenization_fsmt.py
│           │   │   ├── funnel/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_funnel.py
│           │   │   │   ├── test_modeling_tf_funnel.py
│           │   │   │   └── test_tokenization_funnel.py
│           │   │   ├── glpn/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_glpn.py
│           │   │   │   └── test_modeling_glpn.py
│           │   │   ├── gpt2/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_gpt2.py
│           │   │   │   ├── test_modeling_gpt2.py
│           │   │   │   ├── test_modeling_tf_gpt2.py
│           │   │   │   └── test_tokenization_gpt2.py
│           │   │   ├── gpt_neo/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_gpt_neo.py
│           │   │   │   └── test_modeling_gpt_neo.py
│           │   │   ├── gpt_neox/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_gpt_neox.py
│           │   │   ├── gpt_neox_japanese/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_gpt_neox_japanese.py
│           │   │   │   └── test_tokenization_gpt_neox_japanese.py
│           │   │   ├── gptj/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_gptj.py
│           │   │   │   ├── test_modeling_gptj.py
│           │   │   │   └── test_modeling_tf_gptj.py
│           │   │   ├── groupvit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_groupvit.py
│           │   │   │   └── test_modeling_tf_groupvit.py
│           │   │   ├── herbert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_herbert.py
│           │   │   ├── hubert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_hubert.py
│           │   │   │   └── test_modeling_tf_hubert.py
│           │   │   ├── ibert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_ibert.py
│           │   │   ├── imagegpt/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_imagegpt.py
│           │   │   │   └── test_modeling_imagegpt.py
│           │   │   ├── layoutlm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_layoutlm.py
│           │   │   │   ├── test_modeling_tf_layoutlm.py
│           │   │   │   └── test_tokenization_layoutlm.py
│           │   │   ├── layoutlmv2/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_layoutlmv2.py
│           │   │   │   ├── test_modeling_layoutlmv2.py
│           │   │   │   ├── test_processor_layoutlmv2.py
│           │   │   │   └── test_tokenization_layoutlmv2.py
│           │   │   ├── layoutlmv3/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_layoutlmv3.py
│           │   │   │   ├── test_modeling_layoutlmv3.py
│           │   │   │   ├── test_modeling_tf_layoutlmv3.py
│           │   │   │   ├── test_processor_layoutlmv3.py
│           │   │   │   └── test_tokenization_layoutlmv3.py
│           │   │   ├── layoutxlm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_processor_layoutxlm.py
│           │   │   │   └── test_tokenization_layoutxlm.py
│           │   │   ├── led/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_led.py
│           │   │   │   └── test_modeling_tf_led.py
│           │   │   ├── levit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_levit.py
│           │   │   │   └── test_modeling_levit.py
│           │   │   ├── lilt/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_lilt.py
│           │   │   ├── longformer/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_longformer.py
│           │   │   │   ├── test_modeling_tf_longformer.py
│           │   │   │   └── test_tokenization_longformer.py
│           │   │   ├── longt5/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_longt5.py
│           │   │   │   └── test_modeling_longt5.py
│           │   │   ├── luke/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_luke.py
│           │   │   │   └── test_tokenization_luke.py
│           │   │   ├── lxmert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_lxmert.py
│           │   │   │   ├── test_modeling_tf_lxmert.py
│           │   │   │   └── test_tokenization_lxmert.py
│           │   │   ├── m2m_100/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_m2m_100.py
│           │   │   │   └── test_tokenization_m2m_100.py
│           │   │   ├── marian/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_marian.py
│           │   │   │   ├── test_modeling_marian.py
│           │   │   │   ├── test_modeling_tf_marian.py
│           │   │   │   └── test_tokenization_marian.py
│           │   │   ├── markuplm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_markuplm.py
│           │   │   │   ├── test_modeling_markuplm.py
│           │   │   │   ├── test_processor_markuplm.py
│           │   │   │   └── test_tokenization_markuplm.py
│           │   │   ├── maskformer/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_maskformer.py
│           │   │   │   └── test_modeling_maskformer.py
│           │   │   ├── mbart/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_mbart.py
│           │   │   │   ├── test_modeling_mbart.py
│           │   │   │   ├── test_modeling_tf_mbart.py
│           │   │   │   └── test_tokenization_mbart.py
│           │   │   ├── mbart50/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_mbart50.py
│           │   │   ├── mctct/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_mctct.py
│           │   │   │   ├── test_modeling_mctct.py
│           │   │   │   └── test_processor_mctct.py
│           │   │   ├── megatron_bert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_megatron_bert.py
│           │   │   ├── megatron_gpt2/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_megatron_gpt2.py
│           │   │   ├── mluke/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_mluke.py
│           │   │   ├── mobilebert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_mobilebert.py
│           │   │   │   ├── test_modeling_tf_mobilebert.py
│           │   │   │   └── test_tokenization_mobilebert.py
│           │   │   ├── mobilevit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_mobilevit.py
│           │   │   │   ├── test_modeling_mobilevit.py
│           │   │   │   └── test_modeling_tf_mobilevit.py
│           │   │   ├── mpnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_mpnet.py
│           │   │   │   ├── test_modeling_tf_mpnet.py
│           │   │   │   └── test_tokenization_mpnet.py
│           │   │   ├── mt5/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_mt5.py
│           │   │   │   ├── test_modeling_mt5.py
│           │   │   │   └── test_modeling_tf_mt5.py
│           │   │   ├── mvp/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_mvp.py
│           │   │   │   └── test_tokenization_mvp.py
│           │   │   ├── nezha/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_nezha.py
│           │   │   ├── nllb/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_nllb.py
│           │   │   ├── nystromformer/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_nystromformer.py
│           │   │   ├── openai/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_openai.py
│           │   │   │   ├── test_modeling_tf_openai.py
│           │   │   │   └── test_tokenization_openai.py
│           │   │   ├── opt/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_opt.py
│           │   │   │   ├── test_modeling_opt.py
│           │   │   │   └── test_modeling_tf_opt.py
│           │   │   ├── owlvit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_owlvit.py
│           │   │   │   ├── test_modeling_owlvit.py
│           │   │   │   └── test_processor_owlvit.py
│           │   │   ├── pegasus/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_pegasus.py
│           │   │   │   ├── test_modeling_pegasus.py
│           │   │   │   ├── test_modeling_tf_pegasus.py
│           │   │   │   └── test_tokenization_pegasus.py
│           │   │   ├── pegasus_x/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_pegasus_x.py
│           │   │   ├── perceiver/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_perceiver.py
│           │   │   │   └── test_tokenization_perceiver.py
│           │   │   ├── phobert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_phobert.py
│           │   │   ├── plbart/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_plbart.py
│           │   │   │   └── test_tokenization_plbart.py
│           │   │   ├── poolformer/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_poolformer.py
│           │   │   │   └── test_modeling_poolformer.py
│           │   │   ├── prophetnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_prophetnet.py
│           │   │   │   └── test_tokenization_prophetnet.py
│           │   │   ├── qdqbert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_qdqbert.py
│           │   │   ├── rag/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_rag.py
│           │   │   │   ├── test_modeling_tf_rag.py
│           │   │   │   ├── test_retrieval_rag.py
│           │   │   │   └── test_tokenization_rag.py
│           │   │   ├── realm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_realm.py
│           │   │   │   ├── test_retrieval_realm.py
│           │   │   │   └── test_tokenization_realm.py
│           │   │   ├── reformer/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_reformer.py
│           │   │   │   └── test_tokenization_reformer.py
│           │   │   ├── regnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_regnet.py
│           │   │   │   └── test_modeling_tf_regnet.py
│           │   │   ├── rembert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_rembert.py
│           │   │   │   └── test_modeling_tf_rembert.py
│           │   │   ├── resnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_resnet.py
│           │   │   │   └── test_modeling_tf_resnet.py
│           │   │   ├── retribert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_retribert.py
│           │   │   ├── roberta/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_roberta.py
│           │   │   │   ├── test_modeling_roberta.py
│           │   │   │   ├── test_modeling_tf_roberta.py
│           │   │   │   └── test_tokenization_roberta.py
│           │   │   ├── roformer/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_roformer.py
│           │   │   │   ├── test_modeling_roformer.py
│           │   │   │   ├── test_modeling_tf_roformer.py
│           │   │   │   └── test_tokenization_roformer.py
│           │   │   ├── segformer/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_segformer.py
│           │   │   │   ├── test_modeling_segformer.py
│           │   │   │   └── test_modeling_tf_segformer.py
│           │   │   ├── sew/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_sew.py
│           │   │   ├── sew_d/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_sew_d.py
│           │   │   ├── speech_encoder_decoder/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_speech_encoder_decoder.py
│           │   │   │   └── test_modeling_speech_encoder_decoder.py
│           │   │   ├── speech_to_text/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_speech_to_text.py
│           │   │   │   ├── test_modeling_speech_to_text.py
│           │   │   │   ├── test_modeling_tf_speech_to_text.py
│           │   │   │   ├── test_processor_speech_to_text.py
│           │   │   │   └── test_tokenization_speech_to_text.py
│           │   │   ├── speech_to_text_2/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_speech_to_text_2.py
│           │   │   │   └── test_tokenization_speech_to_text_2.py
│           │   │   ├── splinter/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_splinter.py
│           │   │   ├── squeezebert/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_squeezebert.py
│           │   │   │   └── test_tokenization_squeezebert.py
│           │   │   ├── swin/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_swin.py
│           │   │   │   └── test_modeling_tf_swin.py
│           │   │   ├── swinv2/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_swinv2.py
│           │   │   ├── t5/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_t5.py
│           │   │   │   ├── test_modeling_t5.py
│           │   │   │   ├── test_modeling_tf_t5.py
│           │   │   │   └── test_tokenization_t5.py
│           │   │   ├── table_transformer/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_table_transformer.py
│           │   │   ├── tapas/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_tapas.py
│           │   │   │   ├── test_modeling_tf_tapas.py
│           │   │   │   └── test_tokenization_tapas.py
│           │   │   ├── tapex/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_tapex.py
│           │   │   ├── time_series_transformer/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_time_series_transformer.py
│           │   │   ├── trajectory_transformer/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_trajectory_transformer.py
│           │   │   ├── transfo_xl/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_tf_transfo_xl.py
│           │   │   │   ├── test_modeling_transfo_xl.py
│           │   │   │   └── test_tokenization_transfo_xl.py
│           │   │   ├── trocr/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_trocr.py
│           │   │   ├── unispeech/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_unispeech.py
│           │   │   ├── unispeech_sat/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_unispeech_sat.py
│           │   │   ├── van/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_van.py
│           │   │   ├── videomae/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_videomae.py
│           │   │   │   └── test_modeling_videomae.py
│           │   │   ├── vilt/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_vilt.py
│           │   │   │   └── test_modeling_vilt.py
│           │   │   ├── vision_encoder_decoder/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_vision_encoder_decoder.py
│           │   │   │   ├── test_modeling_tf_vision_encoder_decoder.py
│           │   │   │   └── test_modeling_vision_encoder_decoder.py
│           │   │   ├── vision_text_dual_encoder/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_vision_text_dual_encoder.py
│           │   │   │   ├── test_modeling_vision_text_dual_encoder.py
│           │   │   │   └── test_processor_vision_text_dual_encoder.py
│           │   │   ├── visual_bert/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_visual_bert.py
│           │   │   ├── vit/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_vit.py
│           │   │   │   ├── test_modeling_flax_vit.py
│           │   │   │   ├── test_modeling_tf_vit.py
│           │   │   │   └── test_modeling_vit.py
│           │   │   ├── vit_mae/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_tf_vit_mae.py
│           │   │   │   └── test_modeling_vit_mae.py
│           │   │   ├── vit_msn/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_vit_msn.py
│           │   │   ├── wav2vec2/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_wav2vec2.py
│           │   │   │   ├── test_modeling_flax_wav2vec2.py
│           │   │   │   ├── test_modeling_tf_wav2vec2.py
│           │   │   │   ├── test_modeling_wav2vec2.py
│           │   │   │   ├── test_processor_wav2vec2.py
│           │   │   │   └── test_tokenization_wav2vec2.py
│           │   │   ├── wav2vec2_conformer/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_wav2vec2_conformer.py
│           │   │   ├── wav2vec2_phoneme/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_tokenization_wav2vec2_phoneme.py
│           │   │   ├── wav2vec2_with_lm/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_processor_wav2vec2_with_lm.py
│           │   │   ├── wavlm/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_wavlm.py
│           │   │   ├── whisper/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_whisper.py
│           │   │   │   ├── test_modeling_tf_whisper.py
│           │   │   │   ├── test_modeling_whisper.py
│           │   │   │   ├── test_processor_whisper.py
│           │   │   │   └── test_tokenization_whisper.py
│           │   │   ├── x_clip/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_x_clip.py
│           │   │   ├── xglm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_xglm.py
│           │   │   │   ├── test_modeling_tf_xglm.py
│           │   │   │   ├── test_modeling_xglm.py
│           │   │   │   └── test_tokenization_xglm.py
│           │   │   ├── xlm/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_tf_xlm.py
│           │   │   │   ├── test_modeling_xlm.py
│           │   │   │   └── test_tokenization_xlm.py
│           │   │   ├── xlm_prophetnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_xlm_prophetnet.py
│           │   │   │   └── test_tokenization_xlm_prophetnet.py
│           │   │   ├── xlm_roberta/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_flax_xlm_roberta.py
│           │   │   │   ├── test_modeling_tf_xlm_roberta.py
│           │   │   │   ├── test_modeling_xlm_roberta.py
│           │   │   │   └── test_tokenization_xlm_roberta.py
│           │   │   ├── xlm_roberta_xl/
│           │   │   │   ├── __init__.py
│           │   │   │   └── test_modeling_xlm_roberta_xl.py
│           │   │   ├── xlnet/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_modeling_tf_xlnet.py
│           │   │   │   ├── test_modeling_xlnet.py
│           │   │   │   └── test_tokenization_xlnet.py
│           │   │   ├── yolos/
│           │   │   │   ├── __init__.py
│           │   │   │   ├── test_feature_extraction_yolos.py
│           │   │   │   └── test_modeling_yolos.py
│           │   │   └── yoso/
│           │   │       ├── __init__.py
│           │   │       └── test_modeling_yoso.py
│           │   ├── onnx/
│           │   │   ├── __init__.py
│           │   │   ├── test_features.py
│           │   │   ├── test_onnx.py
│           │   │   └── test_onnx_v2.py
│           │   ├── optimization/
│           │   │   ├── __init__.py
│           │   │   ├── test_optimization.py
│           │   │   └── test_optimization_tf.py
│           │   ├── pipelines/
│           │   │   ├── __init__.py
│           │   │   ├── test_pipelines_audio_classification.py
│           │   │   ├── test_pipelines_automatic_speech_recognition.py
│           │   │   ├── test_pipelines_common.py
│           │   │   ├── test_pipelines_conversational.py
│           │   │   ├── test_pipelines_depth_estimation.py
│           │   │   ├── test_pipelines_document_question_answering.py
│           │   │   ├── test_pipelines_feature_extraction.py
│           │   │   ├── test_pipelines_fill_mask.py
│           │   │   ├── test_pipelines_image_classification.py
│           │   │   ├── test_pipelines_image_segmentation.py
│           │   │   ├── test_pipelines_image_to_text.py
│           │   │   ├── test_pipelines_object_detection.py
│           │   │   ├── test_pipelines_question_answering.py
│           │   │   ├── test_pipelines_summarization.py
│           │   │   ├── test_pipelines_table_question_answering.py
│           │   │   ├── test_pipelines_text2text_generation.py
│           │   │   ├── test_pipelines_text_classification.py
│           │   │   ├── test_pipelines_text_generation.py
│           │   │   ├── test_pipelines_token_classification.py
│           │   │   ├── test_pipelines_translation.py
│           │   │   ├── test_pipelines_visual_question_answering.py
│           │   │   ├── test_pipelines_zero_shot.py
│           │   │   ├── test_pipelines_zero_shot_image_classification.py
│           │   │   └── test_pipelines_zero_shot_object_detection.py
│           │   ├── repo_utils/
│           │   │   ├── test_check_copies.py
│           │   │   ├── test_check_dummies.py
│           │   │   └── test_tests_fetcher.py
│           │   ├── sagemaker/
│           │   │   ├── README.md
│           │   │   ├── __init__.py
│           │   │   ├── conftest.py
│           │   │   ├── scripts/
│           │   │   │   ├── pytorch/
│           │   │   │   │   ├── requirements.txt
│           │   │   │   │   ├── run_ddp.py
│           │   │   │   │   └── run_glue_model_parallelism.py
│           │   │   │   └── tensorflow/
│           │   │   │       ├── requirements.txt
│           │   │   │       ├── run_tf.py
│           │   │   │       └── run_tf_dist.py
│           │   │   ├── test_multi_node_data_parallel.py
│           │   │   ├── test_multi_node_model_parallel.py
│           │   │   └── test_single_node_gpu.py
│           │   ├── test_configuration_common.py
│           │   ├── test_feature_extraction_common.py
│           │   ├── test_image_transforms.py
│           │   ├── test_modeling_common.py
│           │   ├── test_modeling_flax_common.py
│           │   ├── test_modeling_tf_common.py
│           │   ├── test_sequence_feature_extraction_common.py
│           │   ├── test_tokenization_common.py
│           │   ├── tokenization/
│           │   │   ├── __init__.py
│           │   │   ├── test_tokenization_fast.py
│           │   │   └── test_tokenization_utils.py
│           │   ├── trainer/
│           │   │   ├── __init__.py
│           │   │   ├── test_data_collator.py
│           │   │   ├── test_trainer.py
│           │   │   ├── test_trainer_callback.py
│           │   │   ├── test_trainer_distributed.py
│           │   │   ├── test_trainer_seq2seq.py
│           │   │   ├── test_trainer_tpu.py
│           │   │   └── test_trainer_utils.py
│           │   └── utils/
│           │       ├── __init__.py
│           │       ├── test_activations.py
│           │       ├── test_activations_tf.py
│           │       ├── test_add_new_model_like.py
│           │       ├── test_cli.py
│           │       ├── test_convert_slow_tokenizer.py
│           │       ├── test_doc_samples.py
│           │       ├── test_file_utils.py
│           │       ├── test_generic.py
│           │       ├── test_hf_argparser.py
│           │       ├── test_hub_utils.py
│           │       ├── test_image_utils.py
│           │       ├── test_logging.py
│           │       ├── test_model_card.py
│           │       ├── test_model_output.py
│           │       ├── test_modeling_tf_core.py
│           │       ├── test_offline.py
│           │       ├── test_skip_decorators.py
│           │       └── test_versions_utils.py
│           └── utils/
│               ├── check_config_docstrings.py
│               ├── check_copies.py
│               ├── check_doc_toc.py
│               ├── check_dummies.py
│               ├── check_inits.py
│               ├── check_repo.py
│               ├── check_self_hosted_runner.py
│               ├── check_table.py
│               ├── check_tf_ops.py
│               ├── create_dummy_models.py
│               ├── custom_init_isort.py
│               ├── documentation_tests.txt
│               ├── download_glue_data.py
│               ├── get_ci_error_statistics.py
│               ├── get_github_job_time.py
│               ├── get_modified_files.py
│               ├── notification_service.py
│               ├── notification_service_doc_tests.py
│               ├── past_ci_versions.py
│               ├── prepare_for_doc_test.py
│               ├── print_env.py
│               ├── release.py
│               ├── sort_auto_mappings.py
│               ├── test_module/
│               │   ├── __init__.py
│               │   ├── custom_configuration.py
│               │   ├── custom_feature_extraction.py
│               │   ├── custom_modeling.py
│               │   ├── custom_pipeline.py
│               │   ├── custom_processing.py
│               │   ├── custom_tokenization.py
│               │   └── custom_tokenization_fast.py
│               ├── tests_fetcher.py
│               ├── tf_ops/
│               │   └── onnx.json
│               └── update_metadata.py
├── docs/
│   ├── disk_commands.txt
│   ├── gcp_setup.md
│   └── paper.md
├── experimental/
│   ├── cost_model.py
│   └── fit_cost_model.py
├── flexllmgen/
│   ├── __init__.py
│   ├── apps/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── completion.py
│   │   ├── data_wrangle/
│   │   │   ├── README.md
│   │   │   ├── data_wrangle_run.py
│   │   │   ├── install.sh
│   │   │   ├── test_batch_query_all_opt175b.sh
│   │   │   ├── test_batch_query_all_opt30b.sh
│   │   │   ├── test_batch_query_all_opt6.7b.sh
│   │   │   ├── test_batch_query_case.sh
│   │   │   ├── test_single_query_all_opt6.7b.sh
│   │   │   ├── test_single_query_case.sh
│   │   │   └── utils/
│   │   │       ├── constants.py
│   │   │       ├── data_utils.py
│   │   │       ├── prompt_utils.py
│   │   │       └── utils.py
│   │   ├── helm_fast_test.py
│   │   ├── helm_passed_30b.sh
│   │   └── helm_run.py
│   ├── compression.py
│   ├── dist_flex_opt.py
│   ├── dist_utils.py
│   ├── flex_opt.py
│   ├── opt_config.py
│   ├── profile_bandwidth.py
│   ├── profile_matmul.py
│   ├── pytorch_backend.py
│   ├── timer.py
│   └── utils.py
├── pyproject.toml
└── scripts/
    ├── mount_nvme_aws.sh
    ├── mount_nvme_gcp.sh
    ├── step_2_consolidate_992_shards_to_singleton.py
    ├── step_3_convert_to_numpy_weights.py
    ├── upload_pypi.sh
    └── utils.py

Download .txt

Showing preview only (3,651K chars total). Download the full file or copy to clipboard to get everything.

SYMBOL INDEX (42052 symbols across 1940 files)

FILE: benchmark/flexllmgen/bench_suite.py
  class Case (line 8) | class Case:

FILE: benchmark/hf_ds/bench_hf.py
  function run_huggingface (line 8) | def run_huggingface(model, prompt_len, gen_len, cut_gen_len, batch_size,
  function bench_one_case (line 38) | def bench_one_case(case):
  class Case (line 61) | class Case:

FILE: benchmark/hf_ds/hf_opt.py
  function get_filename (line 32) | def get_filename(model_name, batch_size, prompt_len, gen_len,
  function meta_to_cpu (line 51) | def meta_to_cpu(container, dtype=None):
  function realize_meta_module (line 62) | def realize_meta_module(module, dtype=None, device=None):
  function get_model_config (line 80) | def get_model_config(model_name):
  function get_ds_opt_model (line 94) | def get_ds_opt_model(model_name, dtype, cpu_offload, disk_offload, offlo...
  function get_hf_opt_model (line 155) | def get_hf_opt_model(model_name, dtype, cpu_offload, disk_offload, offlo...
  function run_generation (line 212) | def run_generation(model_name, batch_size, prompt_len, gen_len, cut_gen_...

FILE: benchmark/petals/run_opt_requests.py
  function _patch_bloom_config (line 11) | def _patch_bloom_config(bloom_config: BloomConfig, opt_config: OPTConfig):
  function client_process (line 18) | def client_process(
  function main (line 50) | def main():
  function run_bench (line 90) | def run_bench(args, sequence_length, max_tokens, config_bloom):

FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/all_gather.py
  function timed_all_gather (line 8) | def timed_all_gather(input, output, args):
  function run_all_gather (line 62) | def run_all_gather(local_rank, args):

FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/all_reduce.py
  function timed_all_reduce (line 7) | def timed_all_reduce(input, args):
  function run_all_reduce (line 41) | def run_all_reduce(local_rank, args):

FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/all_to_all.py
  function timed_all_to_all (line 7) | def timed_all_to_all(input, output, args):
  function run_all_to_all (line 41) | def run_all_to_all(local_rank, args):

FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/broadcast.py
  function timed_broadcast (line 8) | def timed_broadcast(input, args):
  function run_broadcast (line 42) | def run_broadcast(local_rank, args):

FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/pt2pt.py
  function timed_pt2pt (line 7) | def timed_pt2pt(input, args):
  function run_pt2pt (line 60) | def run_pt2pt(local_rank, args):

FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/run_all.py
  function main (line 11) | def main(args, rank):

FILE: benchmark/third_party/DeepSpeed/benchmarks/communication/utils.py
  function init_torch_distributed (line 10) | def init_torch_distributed(backend):
  function init_deepspeed_comm (line 18) | def init_deepspeed_comm(backend):
  function init_processes (line 27) | def init_processes(local_rank, args):
  function print_rank_0 (line 37) | def print_rank_0(message):
  function print_header (line 42) | def print_header(args, comm_op):
  function get_bw (line 58) | def get_bw(comm_op, size, duration, args):
  function get_metric_strings (line 86) | def get_metric_strings(args, tput, busbw, duration):
  function sync_all (line 101) | def sync_all():
  function max_numel (line 106) | def max_numel(comm_op, dtype, mem_factor, local_rank, args):
  function convert_size (line 131) | def convert_size(size_bytes):
  function _element_size (line 142) | def _element_size(dtype):
  function benchmark_parser (line 160) | def benchmark_parser():

FILE: benchmark/third_party/DeepSpeed/benchmarks/inference/bert-bench.py
  function print_latency (line 19) | def print_latency(latency_set, title, warmup=3):

FILE: benchmark/third_party/DeepSpeed/benchmarks/inference/collect_results.py
  function get_branch (line 32) | def get_branch(file_path):
  function get_benchmark_params (line 40) | def get_benchmark_params(root_dir, file_path):
  function get_perf_data (line 60) | def get_perf_data(file_content):
  function get_generated_text (line 68) | def get_generated_text(file_content, gen_text_n):
  function get_error (line 78) | def get_error(file_content):

FILE: benchmark/third_party/DeepSpeed/benchmarks/inference/gpt-bench.py
  function print_latency (line 35) | def print_latency(latency_set, title, warmup=3):

FILE: benchmark/third_party/DeepSpeed/csrc/adagrad/cpu_adagrad.cpp
  function create_adagrad_optimizer (line 98) | int create_adagrad_optimizer(int optimizer_id,
  function ds_adagrad_step (line 150) | int ds_adagrad_step(int optimizer_id,
  function ds_adagrad_step_plus_copy (line 177) | int ds_adagrad_step_plus_copy(int optimizer_id,
  function destroy_adagrad_optimizer (line 212) | int destroy_adagrad_optimizer(int optimizer_id)
  function PYBIND11_MODULE (line 219) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: benchmark/third_party/DeepSpeed/csrc/adam/cpu_adam.cpp
  function create_adam_optimizer (line 124) | int create_adam_optimizer(int optimizer_id,
  function ds_adam_step (line 193) | int ds_adam_step(int optimizer_id,
  function ds_adam_step_plus_copy (line 235) | int ds_adam_step_plus_copy(int optimizer_id,
  function destroy_adam_optimizer (line 277) | int destroy_adam_optimizer(int optimizer_id)
  function PYBIND11_MODULE (line 284) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: benchmark/third_party/DeepSpeed/csrc/adam/fused_adam_frontend.cpp
  function PYBIND11_MODULE (line 15) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: benchmark/third_party/DeepSpeed/csrc/aio/common/deepspeed_aio_common.cpp
  function _report_aio_statistics (line 45) | static void _report_aio_statistics(const char* tag,
  function _get_aio_latencies (line 58) | static void _get_aio_latencies(std::vector<std::chrono::duration<double>...
  function _do_io_submit_singles (line 69) | static void _do_io_submit_singles(const long long int n_iocbs,
  function _do_io_submit_block (line 90) | static void _do_io_submit_block(const long long int n_iocbs,
  function _do_io_complete (line 110) | static int _do_io_complete(const long long int min_completes,
  function do_aio_operation_sequential (line 124) | void do_aio_operation_sequential(const bool read_op,
  function do_aio_operation_overlap (line 188) | void do_aio_operation_overlap(const bool read_op,
  function report_file_error (line 256) | void report_file_error(const char* filename, const std::string file_op, ...
  function open_file (line 263) | int open_file(const char* filename, const bool read_op)
  function regular_read (line 277) | int regular_read(const char* filename, std::vector<char>& buffer)
  function _validate_buffer (line 304) | static bool _validate_buffer(const char* filename, void* aio_buffer, con...
  function validate_aio_operation (line 317) | bool validate_aio_operation(const bool read_op,

FILE: benchmark/third_party/DeepSpeed/csrc/aio/common/deepspeed_aio_types.cpp
  type deepspeed_aio_latency_t (line 44) | struct deepspeed_aio_latency_t
  type iocb (line 63) | struct iocb
  type iocb (line 63) | struct iocb

FILE: benchmark/third_party/DeepSpeed/csrc/aio/common/deepspeed_aio_types.h
  type deepspeed_aio_latency_t (line 16) | struct deepspeed_aio_latency_t {
  type deepspeed_aio_perf_t (line 26) | struct deepspeed_aio_perf_t {
  type deepspeed_aio_config_t (line 33) | struct deepspeed_aio_config_t {
  type aio_context (line 48) | struct aio_context {

FILE: benchmark/third_party/DeepSpeed/csrc/aio/common/deepspeed_aio_utils.cpp
  type iocb (line 28) | struct iocb
  type iocb (line 68) | struct iocb
  function get_file_size (line 95) | int get_file_size(const char* filename, long long int& size)

FILE: benchmark/third_party/DeepSpeed/csrc/aio/common/deepspeed_aio_utils.h
  type io_xfer_ctxt (line 29) | struct io_xfer_ctxt {
  type io_prep_context (line 41) | struct io_prep_context {
  type io_prep_generator (line 58) | struct io_prep_generator {

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_aio_thread.cpp
  type io_op_desc_t (line 49) | struct io_op_desc_t

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_aio_thread.h
  type io_op_desc_t (line 13) | struct io_op_desc_t {
  type thread_sync_t (line 34) | struct thread_sync_t {
  type deepspeed_aio_thread_t (line 39) | struct deepspeed_aio_thread_t {

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_py_aio.cpp
  function deepspeed_py_aio_write (line 38) | int deepspeed_py_aio_write(const torch::Tensor& buffer,
  function deepspeed_py_aio_read (line 77) | int deepspeed_py_aio_read(torch::Tensor& buffer,

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
  function _start_aio_thread (line 13) | static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_threa...
  type io_op_desc_t (line 132) | struct io_op_desc_t
  type io_op_desc_t (line 144) | struct io_op_desc_t
  type io_op_desc_t (line 146) | struct io_op_desc_t

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_py_aio_handle.h
  type deepspeed_aio_handle_t (line 12) | struct deepspeed_aio_handle_t {

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_py_copy.cpp
  function helper_memcpy_1 (line 23) | static void helper_memcpy_1(float* dest, float* src, size_t param_size)
  function helper_memcpy_4 (line 52) | static void helper_memcpy_4(float* dest, float* src, size_t param_size)
  function helper_mempcy_8 (line 83) | static void helper_mempcy_8(float* dest, float* src, size_t param_size)
  function deepspeed_py_memcpy (line 122) | int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src)

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_lib/py_ds_aio.cpp
  function PYBIND11_MODULE (line 12) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_test/aio_bench_generate_param.py
  function parse_arguments (line 14) | def parse_arguments():
  function validate_args (line 31) | def validate_args(args):
  function convert_to_param (line 41) | def convert_to_param(key):
  function generate_aio_param (line 52) | def generate_aio_param(read_log_dir, write_log_dir):
  function main (line 84) | def main():

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_test/aio_bench_perf_sweep.py
  class Job (line 35) | class Job(object):
    method __init__ (line 36) | def __init__(self, cmd_line, output_file=None, work_dir=None):
    method cmd (line 42) | def cmd(self):
    method get_stdout (line 45) | def get_stdout(self):
    method get_stderr (line 48) | def get_stderr(self):
    method get_cwd (line 51) | def get_cwd(self):
    method open_output_file (line 54) | def open_output_file(self):
    method close_output_file (line 58) | def close_output_file(self):
  class SweepConfig (line 64) | class SweepConfig(object):
    method __init__ (line 65) | def __init__(self, args):
  function parse_arguments (line 77) | def parse_arguments():
  function dump_cmd_lines (line 133) | def dump_cmd_lines(cmd_lines):
  function get_sweep_config_dict (line 139) | def get_sweep_config_dict(sweep_config_json):
  function get_sweep_cmd_lines (line 148) | def get_sweep_cmd_lines(sweep_config_dict):
  function run_job (line 168) | def run_job(job):
  function launch_sweep (line 182) | def launch_sweep(sweep_jobs, sync_job, flush_cache_job):
  function create_cmd_tags (line 193) | def create_cmd_tags(cmd_line):
  function get_log_file (line 204) | def get_log_file(io_op_desc, cmd_line):
  function create_perf_jobs (line 263) | def create_perf_jobs(io_op_desc, log_dir, cmd_lines):
  function script_path (line 275) | def script_path():
  function async_io_setup (line 279) | def async_io_setup():
  function get_block_size_and_count (line 284) | def get_block_size_and_count(io_bytes):
  function create_read_file (line 296) | def create_read_file(sweep_config):
  function remove_folder (line 314) | def remove_folder(folder):
  function run_read_sweep (line 319) | def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
  function run_write_sweep (line 340) | def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
  function main (line 363) | def main():

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_test/ds_aio_basic.py
  function pre_basic (line 16) | def pre_basic(args, tid, read_op):
  function pre_basic_read (line 37) | def pre_basic_read(pool_params):
  function pre_basic_write (line 43) | def pre_basic_write(pool_params):
  function post_basic (line 49) | def post_basic(pool_params):
  function main_basic_read (line 56) | def main_basic_read(pool_params):
  function main_basic_write (line 72) | def main_basic_write(pool_params):
  function get_schedule (line 88) | def get_schedule(args, read_op):
  function _aio_handle_tasklet (line 102) | def _aio_handle_tasklet(pool_params):
  function _init_tasklet (line 133) | def _init_tasklet(b):
  function aio_basic_multiprocessing (line 138) | def aio_basic_multiprocessing(args, read_op):

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_test/ds_aio_handle.py
  function pre_handle (line 16) | def pre_handle(args, tid, read_op):
  function pre_handle_read (line 49) | def pre_handle_read(pool_params):
  function pre_handle_write (line 55) | def pre_handle_write(pool_params):
  function post_handle (line 61) | def post_handle(pool_params):
  function main_parallel_read (line 68) | def main_parallel_read(pool_params):
  function main_parallel_write (line 82) | def main_parallel_write(pool_params):
  function main_handle_read (line 95) | def main_handle_read(pool_parms):
  function main_handle_write (line 108) | def main_handle_write(pool_parms):
  function get_schedule (line 120) | def get_schedule(args, read_op):
  function _aio_handle_tasklet (line 134) | def _aio_handle_tasklet(pool_params):
  function _init_tasklet (line 165) | def _init_tasklet(b):
  function aio_handle_multiprocessing (line 170) | def aio_handle_multiprocessing(args, read_op):

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_test/parse_aio_stats.py
  function parse_arguments (line 19) | def parse_arguments():
  function extract_value (line 38) | def extract_value(key, file):
  function get_file_key (line 62) | def get_file_key(file):
  function get_thread_count (line 69) | def get_thread_count(file):
  function get_metric (line 90) | def get_metric(file, metric):
  function validate_args (line 105) | def validate_args(args):
  function get_results (line 117) | def get_results(log_files, metric):
  function get_sorted_results (line 127) | def get_sorted_results(log_dir, metric):
  function main (line 140) | def main():

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_test/test_ds_aio.py
  function parse_arguments (line 16) | def parse_arguments():
  function validate_args (line 71) | def validate_args(args):
  function main (line 79) | def main():

FILE: benchmark/third_party/DeepSpeed/csrc/aio/py_test/test_ds_aio_utils.py
  function task_log (line 12) | def task_log(tid, msg):
  function task_barrier (line 17) | def task_barrier(barrier, num_parties):
  function report_results (line 23) | def report_results(args, read_op, pool_results):
  function refine_integer_value (line 43) | def refine_integer_value(value):
  function refine_args (line 52) | def refine_args(args):

FILE: benchmark/third_party/DeepSpeed/csrc/includes/StopWatch.h
  function class (line 10) | class Stopwatch {
  function class (line 45) | class Stopwatch {

FILE: benchmark/third_party/DeepSpeed/csrc/includes/Timer.h
  function class (line 9) | class GPUTimer {
  function class (line 32) | class CPUTimer {

FILE: benchmark/third_party/DeepSpeed/csrc/includes/context.h
  function DS_GET_BLOCKS (line 34) | inline int DS_GET_BLOCKS(const int N)
  function class (line 42) | class Context {

FILE: benchmark/third_party/DeepSpeed/csrc/includes/conversion_utils.h
  function namespace (line 16) | namespace conversion {

FILE: benchmark/third_party/DeepSpeed/csrc/includes/cpu_adagrad.h
  function class (line 22) | class Adagrad_Optimizer {
  function SynchronizeStreams (line 51) | inline void SynchronizeStreams()
  function IncrementStep (line 55) | inline void IncrementStep(size_t step)
  function update_state (line 60) | inline void update_state(float lr, float epsilon, float weight_decay)

FILE: benchmark/third_party/DeepSpeed/csrc/includes/cpu_adam.h
  function class (line 23) | class Adam_Optimizer {
  function SynchronizeStreams (line 67) | inline void SynchronizeStreams()
  function IncrementStep (line 71) | inline void IncrementStep(size_t step, float beta1, float beta2)
  function update_state (line 91) | inline void update_state(float lr, float epsilon, float weight_decay, bo...

FILE: benchmark/third_party/DeepSpeed/csrc/includes/dequantization_utils.h
  function namespace (line 14) | namespace dequantize {

FILE: benchmark/third_party/DeepSpeed/csrc/includes/dropout.h
  type Config (line 10) | struct Config {
  function ForwardWithBias (line 31) | void ForwardWithBias(int bsz, T* vals, const T* bias, cudaStream_t stream)
  function ForwardWithBias (line 36) | void ForwardWithBias(int bsz,
  function Backward (line 47) | void Backward(int bsz, T* d_vals, cudaStream_t stream)
  function Backward (line 52) | void Backward(int bsz, T* d_vals_out, const T* d_vals, cudaStream_t stream)
  function SetTrainingMode (line 60) | void SetTrainingMode(bool training) { _config.training = training; }
  function SetMask (line 62) | void SetMask(uint8_t* mask)
  function SetDimension (line 71) | inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }

FILE: benchmark/third_party/DeepSpeed/csrc/includes/ds_transformer_cuda.h
  function GetNumHeads (line 17) | struct BertGemmAlgos {

FILE: benchmark/third_party/DeepSpeed/csrc/includes/gelu.h
  type Config (line 11) | struct Config {

FILE: benchmark/third_party/DeepSpeed/csrc/includes/memory_access_utils.h
  function namespace (line 11) | namespace mem_access {
  function namespace (line 469) | namespace internal {
  function memcpy_async (line 809) | void memcpy_async(void* shr, const void* gbl)
  function memcpy_async_nop (line 820) | void memcpy_async_nop(void* shr, const void* gbl, bool predicate)
  function memcpy_async_zero (line 836) | void memcpy_async_zero(void* shr, const void* gbl, bool predicate)
  function memcpy_async_zero_nop (line 848) | void memcpy_async_zero_nop(void* shr,
  function memcpy_async_cg (line 868) | void memcpy_async_cg(void* shr, const void* gbl)
  function memcpy_async_nop_cg (line 875) | void memcpy_async_nop_cg(void* shr, const void* gbl, bool predicate)
  function memcpy_async_zero_cg (line 889) | void memcpy_async_zero_cg(void* shr, const void* gbl, bool predicate)
  function memcpy_async_zero_nop_cg (line 899) | void memcpy_async_zero_nop_cg(void* shr,
  function memcpy_async_fence (line 917) | void memcpy_async_fence() { asm volatile("cp.async.commit_group;\n"); }
  function memcpy_async_wait (line 920) | void memcpy_async_wait()

FILE: benchmark/third_party/DeepSpeed/csrc/includes/normalize_layer.h
  type Config (line 14) | struct Config {
  function Backward (line 109) | void Backward(int bsz,
  function BackwardFusedAdd (line 157) | void BackwardFusedAdd(int bsz,
  function SetVar (line 185) | inline void SetVar(T* variance)
  function SetMean (line 191) | inline void SetMean(T* mean)

FILE: benchmark/third_party/DeepSpeed/csrc/includes/quantization.h
  function namespace (line 7) | namespace quantize {

FILE: benchmark/third_party/DeepSpeed/csrc/includes/quantization_utils.h
  function namespace (line 16) | namespace quantize {
  function DS_D_INLINE (line 238) | DS_D_INLINE GroupStats() { cur_max = reduce::init<rop::Max, __half2>(); }
  function DS_D_INLINE (line 245) | DS_D_INLINE void update(__half2 val)
  function DS_D_INLINE (line 284) | DS_D_INLINE GroupStats()
  function DS_D_INLINE (line 295) | DS_D_INLINE void update(__half2 val)
  function data_i8 (line 427) | auto data_i8 = PackedInt4{data_i8_2, data_i8_1};
  function _chunk (line 434) | void _chunk(int8_t* local_output, const __half2* data, Params<qType, num...
  function local_array (line 451) | void local_array(cg::thread_block& tb,

FILE: benchmark/third_party/DeepSpeed/csrc/includes/reduction_utils.h
  function namespace (line 13) | namespace reduce {
  function partitioned_block (line 545) | int num_threads>
  function partitioned_block (line 567) | int num_threads>

FILE: benchmark/third_party/DeepSpeed/csrc/includes/simd.h
  function simd_store (line 60) | void simd_store(float* dst, AVX_Data* src, bool half_precision)
  function simd_load (line 67) | void simd_load(AVX_Data* dst, float* src, bool half_precision)
  function simd_fma (line 74) | void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Da...
  function simd_fma (line 82) | void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Da...
  function simd_fma (line 90) | void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data* src_m_r, AVX_D...
  function simd_sqrt (line 98) | void simd_sqrt(AVX_Data* dst, AVX_Data* src)
  function simd_add (line 104) | void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r)
  function simd_add (line 110) | void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r)
  function simd_mul (line 116) | void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r)
  function simd_mul (line 122) | void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r)
  function simd_div (line 128) | void simd_div(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r)

FILE: benchmark/third_party/DeepSpeed/csrc/includes/softmax.h
  type Config (line 15) | struct Config {
  function SetSeqLength (line 56) | inline void SetSeqLength(size_t seq_len) { config_.seq_length = seq_len; }

FILE: benchmark/third_party/DeepSpeed/csrc/includes/strided_batch_gemm.h
  type Config (line 11) | struct Config {
  function ForwardPlusSave (line 82) | void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, ...
  function T (line 185) | inline const T* GetBufferA() const { return k_buf; }
  function T (line 187) | inline const T* GetBufferB() const { return q_buf; }
  function SetConfig (line 189) | inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); }

FILE: benchmark/third_party/DeepSpeed/csrc/lamb/fused_lamb_cuda.cpp
  function lamb (line 32) | at::Tensor lamb(at::Tensor& p,
  function PYBIND11_MODULE (line 106) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: benchmark/third_party/DeepSpeed/csrc/quantization/pt_binding.cpp
  function ds_quantize (line 8) | at::Tensor ds_quantize(at::Tensor& vals, int groups, int bits)
  function ds_sr_quantize (line 22) | at::Tensor ds_sr_quantize(at::Tensor& vals, int groups, int bits)
  function ds_quantize_asym (line 36) | at::Tensor ds_quantize_asym(at::Tensor& vals, int groups, int bits)
  function ds_sr_quantize_asym (line 50) | at::Tensor ds_sr_quantize_asym(at::Tensor& vals, int groups, int bits)
  function quantize_kernel (line 73) | std::vector<at::Tensor> quantize_kernel(at::Tensor& input_vals,
  function dequantize (line 116) | at::Tensor dequantize(at::Tensor& quantized_data,
  function PYBIND11_MODULE (line 147) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: benchmark/third_party/DeepSpeed/csrc/sparse_attention/utils.cpp
  function segment_blocks (line 14) | void segment_blocks(torch::Tensor layout,
  function ret_t (line 90) | ret_t sdd_segment(torch::Tensor layout, int start_width)
  function PYBIND11_MODULE (line 117) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: benchmark/third_party/DeepSpeed/csrc/spatial/csrc/pt_binding.cpp
  function ChannelsLastProblem (line 11) | ChannelsLastProblem dimension_problem(at::Tensor& input)
  function seq_unroll_bias_add (line 34) | at::Tensor seq_unroll_bias_add(at::Tensor& input, at::Tensor& bias)
  function seq_bias_add_add (line 57) | at::Tensor seq_bias_add_add(at::Tensor& input, at::Tensor& bias, at::Ten...
  function seq_bias_add_bias_add (line 80) | at::Tensor seq_bias_add_bias_add(at::Tensor& input,
  function PYBIND11_MODULE (line 106) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: benchmark/third_party/DeepSpeed/csrc/transformer/ds_transformer_cuda.cpp
  function get_workspace_size (line 22) | unsigned get_workspace_size(unsigned maxBatchSize,
  function create_transformer_layer (line 591) | int create_transformer_layer(unsigned layer_id,
  function ds_transformer_forward (line 638) | std::vector<torch::Tensor> ds_transformer_forward(unsigned layer_id,
  function ds_transformer_backward (line 823) | std::vector<torch::Tensor> ds_transformer_backward(unsigned layer_id,
  function PYBIND11_MODULE (line 1029) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: benchmark/third_party/DeepSpeed/csrc/transformer/inference/csrc/pt_binding.cpp
  type ActivationFuncType (line 18) | enum class ActivationFuncType { UNKNOWN = 0, GELU = 1, ReLU = 2 }
  type TransformerType (line 20) | enum class TransformerType : uint8_t { UNKNOWN = 0, GPTType = 1, BERTTyp...
  function infer_transformer_type (line 24) | inline auto infer_transformer_type(at::Tensor& attn_mask) -> Transformer...
  function get_attn_mask_stride (line 38) | inline auto get_attn_mask_stride(at::Tensor& attn_mask) -> int
  function ds_softmax (line 56) | at::Tensor ds_softmax(at::Tensor& attn_scores,
  function allocate_workspace (line 104) | void allocate_workspace(unsigned hidden_dim,
  function einsum_sec_sm_ecm (line 127) | at::Tensor einsum_sec_sm_ecm(at::Tensor& Q, at::Tensor& W)
  function attention_unfused (line 170) | void attention_unfused(at::Tensor& prev_key_cont,
  function ds_softmax_context1 (line 258) | std::vector<at::Tensor> ds_softmax_context1(at::Tensor& query,
  function ds_softmax_internal (line 311) | void ds_softmax_internal(T* attn_scores,
  function attention_unfused (line 345) | void attention_unfused(T* prev_key_cont,
  function ds_softmax_context (line 425) | std::vector<at::Tensor> ds_softmax_context(at::Tensor& query_key_value,
  function ds_bias_gelu (line 533) | at::Tensor ds_bias_gelu(at::Tensor& input, at::Tensor& bias)
  function ds_bias_geglu (line 548) | at::Tensor ds_bias_geglu(at::Tensor& activation, at::Tensor& bias)
  function ds_bias_relu (line 584) | at::Tensor ds_bias_relu(at::Tensor& input, at::Tensor& bias)
  function ds_bias_add (line 600) | at::Tensor ds_bias_add(at::Tensor& input, at::Tensor& bias)
  function ds_bias_residual (line 616) | at::Tensor ds_bias_residual(at::Tensor& input, at::Tensor& residual, at:...
  function ds_layer_norm (line 632) | at::Tensor ds_layer_norm(at::Tensor& input, at::Tensor& gamma, at::Tenso...
  function ds_layer_norm_internal (line 662) | void ds_layer_norm_internal(T* workspace,
  function ds_layer_norm_residual (line 680) | at::Tensor ds_layer_norm_residual(at::Tensor& input,
  function ds_layer_norm_residual_store (line 719) | std::vector<at::Tensor> ds_layer_norm_residual_store(at::Tensor& input,
  function quantized_gemm (line 761) | void quantized_gemm(void* output,
  function qkv_unfused_cublas (line 807) | at::Tensor qkv_unfused_cublas(at::Tensor& output,
  function ds_qkv_gemm (line 859) | std::vector<at::Tensor> ds_qkv_gemm(at::Tensor& input,
  function quantized_gemm (line 891) | void quantized_gemm(at::Tensor& output,
  function ds_qkv_gemm_int8 (line 936) | at::Tensor ds_qkv_gemm_int8(at::Tensor& input,
  function ds_linear_layer (line 970) | at::Tensor ds_linear_layer(at::Tensor& input,
  function add_padding (line 1083) | std::vector<at::Tensor> add_padding(at::Tensor& query, at::Tensor& key, ...
  function padd_add_transform (line 1125) | std::vector<at::Tensor> padd_add_transform(at::Tensor& query,
  function ds_linear_layer_int8 (line 1176) | at::Tensor ds_linear_layer_int8(at::Tensor& input,
  function ds_vector_matmul (line 1202) | at::Tensor ds_vector_matmul(at::Tensor& input,
  function ds_vector_matmul_int8 (line 1252) | at::Tensor ds_vector_matmul_int8(at::Tensor& input,
  function mlp_unfused_cublas (line 1272) | at::Tensor mlp_unfused_cublas(at::Tensor& output,
  function ds_mlp_gemm (line 1382) | std::vector<at::Tensor> ds_mlp_gemm(at::Tensor& input,
  function ds_mlp_gemm_int8 (line 1432) | std::vector<at::Tensor> ds_mlp_gemm_int8(at::Tensor& input,
  function fused_gemm_gelu (line 1468) | at::Tensor fused_gemm_gelu(at::Tensor& input,
  function apply_rotary_pos_emb (line 1602) | std::vector<at::Tensor> apply_rotary_pos_emb(at::Tensor& mixed_query,
  function fused_gemm_gelu_int8 (line 1647) | at::Tensor fused_gemm_gelu_int8(at::Tensor& input,
  function moe_res_matmul (line 1676) | at::Tensor moe_res_matmul(at::Tensor& moe_res, at::Tensor& coef, at::Ten...
  function PYBIND11_MODULE (line 1699) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: benchmark/third_party/DeepSpeed/csrc/transformer/inference/includes/inference_context.h
  function DS_GET_BLOCKS (line 40) | inline int DS_GET_BLOCKS(const int N)
  function class (line 48) | class Context {
  function virtual (line 73) | virtual ~Context()
  function Context (line 83) | static Context& Instance()
  function GenWorkSpace (line 89) | void GenWorkSpace(const unsigned& num_layers,
  function cudaEvent_t (line 163) | cudaEvent_t GetCompEvent(int id) { return id == 1 ? _comp1_event : _comp...
  function new_token (line 172) | inline unsigned new_token(unsigned layer_id)
  function advance_tokens (line 185) | inline void advance_tokens() { _num_tokens++; }
  function cublasHandle_t (line 205) | cublasHandle_t GetCublasHandle() { return _cublasHandle; }
  function SetSeed (line 214) | void SetSeed(uint64_t new_seed) { _seed = new_seed; }
  function SynchComp (line 218) | inline void SynchComp()
  function SynchComm (line 223) | inline void SynchComm()

FILE: benchmark/third_party/DeepSpeed/csrc/utils/flatten_unflatten.cpp
  function flatten (line 11) | at::Tensor flatten(std::vector<at::Tensor> tensors)
  function unflatten (line 16) | std::vector<at::Tensor> unflatten(at::Tensor flat, std::vector<at::Tenso...
  function PYBIND11_MODULE (line 21) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: benchmark/third_party/DeepSpeed/deepspeed/__init__.py
  function _parse_version (line 39) | def _parse_version(version_str):
  function initialize (line 52) | def initialize(args=None,
  function _add_core_arguments (line 159) | def _add_core_arguments(parser):
  function add_config_arguments (line 210) | def add_config_arguments(parser):
  function default_inference_config (line 226) | def default_inference_config():
  function init_inference (line 233) | def init_inference(model, config=None, **kwargs):

FILE: benchmark/third_party/DeepSpeed/deepspeed/accelerator/abstract_accelerator.py
  class DeepSpeedAccelerator (line 5) | class DeepSpeedAccelerator(ABC):
    method __init__ (line 6) | def __init__(self):
    method device_name (line 12) | def device_name(self, device_index):
    method device (line 16) | def device(self, device_index):
    method set_device (line 20) | def set_device(self, device_index):
    method current_device (line 24) | def current_device(self):
    method current_device_name (line 28) | def current_device_name(self):
    method device_count (line 32) | def device_count(self):
    method synchronize (line 36) | def synchronize(self, device_index=None):
    method random (line 41) | def random(self):
    method set_rng_state (line 45) | def set_rng_state(self, new_state, device_index=None):
    method get_rng_state (line 49) | def get_rng_state(self, device_index=None):
    method manual_seed (line 53) | def manual_seed(self, seed):
    method manual_seed_all (line 57) | def manual_seed_all(self, seed):
    method initial_seed (line 61) | def initial_seed(self, seed):
    method default_generator (line 65) | def default_generator(self, device_index):
    method Stream (line 70) | def Stream(self, device=None, priority=0, **kwargs):
    method StreamContext (line 74) | def StreamContext(self, stream):
    method stream (line 78) | def stream(self, stream):
    method current_stream (line 82) | def current_stream(self, device_index=None):
    method default_stream (line 86) | def default_stream(self, device_index=None):
    method Event (line 90) | def Event(self, **kwargs):
    method empty_cache (line 95) | def empty_cache(self):
    method memory_allocated (line 99) | def memory_allocated(self, device_index=None):
    method max_memory_allocated (line 103) | def max_memory_allocated(self, device_index=None):
    method reset_max_memory_allocated (line 107) | def reset_max_memory_allocated(self, device_index=None):
    method memory_cached (line 111) | def memory_cached(self, device_index=None):
    method max_memory_cached (line 115) | def max_memory_cached(self, device_index=None):
    method reset_max_memory_cached (line 119) | def reset_max_memory_cached(self, device_index=None):
    method memory_stats (line 123) | def memory_stats(self, device_index=None):
    method reset_peak_memory_stats (line 127) | def reset_peak_memory_stats(self, device_index=None):
    method memory_reserved (line 131) | def memory_reserved(self, device_index=None):
    method max_memory_reserved (line 135) | def max_memory_reserved(self, device_index=None):
    method total_memory (line 139) | def total_memory(self, device_index=None):
    method is_bf16_supported (line 144) | def is_bf16_supported(self):
    method is_fp16_supported (line 148) | def is_fp16_supported(self):
    method amp (line 153) | def amp(self):
    method is_available (line 157) | def is_available(self):
    method range_push (line 161) | def range_push(self, msg):
    method range_pop (line 165) | def range_pop(self):
    method lazy_call (line 169) | def lazy_call(self, callback):
    method communication_backend_name (line 173) | def communication_backend_name(self):
    method BFloat16Tensor (line 179) | def BFloat16Tensor(self):
    method ByteTensor (line 184) | def ByteTensor(self):
    method DoubleTensor (line 189) | def DoubleTensor(self):
    method FloatTensor (line 194) | def FloatTensor(self):
    method HalfTensor (line 199) | def HalfTensor(self):
    method IntTensor (line 204) | def IntTensor(self):
    method LongTensor (line 209) | def LongTensor(self):
    method pin_memory (line 213) | def pin_memory(self, tensor):
    method on_accelerator (line 217) | def on_accelerator(self, tensor):
    method op_builder_dir (line 221) | def op_builder_dir(self):
    method create_op_builder (line 225) | def create_op_builder(self, class_name):
    method build_extension (line 229) | def build_extension(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/accelerator/cuda_accelerator.py
  class CUDA_Accelerator (line 5) | class CUDA_Accelerator(DeepSpeedAccelerator):
    method __init__ (line 6) | def __init__(self):
    method device_name (line 11) | def device_name(self, device_index=None):
    method device (line 16) | def device(self, device_index=None):
    method set_device (line 19) | def set_device(self, device_index):
    method current_device (line 22) | def current_device(self):
    method current_device_name (line 25) | def current_device_name(self):
    method device_count (line 28) | def device_count(self):
    method synchronize (line 31) | def synchronize(self, device_index=None):
    method random (line 35) | def random(self):
    method set_rng_state (line 38) | def set_rng_state(self, new_state, device_index=None):
    method get_rng_state (line 44) | def get_rng_state(self, device_index=None):
    method manual_seed (line 50) | def manual_seed(self, seed):
    method manual_seed_all (line 53) | def manual_seed_all(self, seed):
    method initial_seed (line 56) | def initial_seed(self, seed):
    method default_generator (line 59) | def default_generator(self, device_index):
    method Stream (line 63) | def Stream(self, device=None, priority=0, **kwargs):
    method StreamContext (line 66) | def StreamContext(self, stream):
    method stream (line 69) | def stream(self, stream):
    method current_stream (line 72) | def current_stream(self, device_index=None):
    method default_stream (line 75) | def default_stream(self, device_index=None):
    method Event (line 78) | def Event(self, **kwargs):
    method empty_cache (line 82) | def empty_cache(self):
    method memory_allocated (line 85) | def memory_allocated(self, device_index=None):
    method max_memory_allocated (line 88) | def max_memory_allocated(self, device_index=None):
    method reset_max_memory_allocated (line 91) | def reset_max_memory_allocated(self, device_index=None):
    method memory_cached (line 94) | def memory_cached(self, device_index=None):
    method max_memory_cached (line 97) | def max_memory_cached(self, device_index=None):
    method reset_max_memory_cached (line 100) | def reset_max_memory_cached(self, device_index=None):
    method memory_stats (line 103) | def memory_stats(self, device_index=None):
    method reset_peak_memory_stats (line 107) | def reset_peak_memory_stats(self, device_index=None):
    method memory_reserved (line 111) | def memory_reserved(self, device_index=None):
    method max_memory_reserved (line 115) | def max_memory_reserved(self, device_index=None):
    method total_memory (line 119) | def total_memory(self, device_index=None):
    method is_bf16_supported (line 123) | def is_bf16_supported(self):
    method is_fp16_supported (line 126) | def is_fp16_supported(self):
    method amp (line 134) | def amp(self):
    method is_available (line 139) | def is_available(self):
    method range_push (line 142) | def range_push(self, msg):
    method range_pop (line 146) | def range_pop(self):
    method lazy_call (line 150) | def lazy_call(self, callback):
    method communication_backend_name (line 153) | def communication_backend_name(self):
    method BFloat16Tensor (line 159) | def BFloat16Tensor(self):
    method ByteTensor (line 163) | def ByteTensor(self):
    method DoubleTensor (line 167) | def DoubleTensor(self):
    method FloatTensor (line 171) | def FloatTensor(self):
    method HalfTensor (line 175) | def HalfTensor(self):
    method IntTensor (line 179) | def IntTensor(self):
    method LongTensor (line 183) | def LongTensor(self):
    method pin_memory (line 186) | def pin_memory(self, tensor):
    method on_accelerator (line 189) | def on_accelerator(self, tensor):
    method op_builder_dir (line 196) | def op_builder_dir(self):
    method create_op_builder (line 199) | def create_op_builder(self, class_name):
    method build_extension (line 238) | def build_extension(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/accelerator/real_accelerator.py
  function _validate_accelerator (line 6) | def _validate_accelerator(accel_obj):
  function get_accelerator (line 15) | def get_accelerator():
  function set_accelerator (line 33) | def set_accelerator(accel_obj):

FILE: benchmark/third_party/DeepSpeed/deepspeed/autotuning/autotuner.py
  class Autotuner (line 31) | class Autotuner:
    method __init__ (line 35) | def __init__(self, args, active_resources):
    method print_tuning_results (line 93) | def print_tuning_results(self):
    method _get_user_config (line 153) | def _get_user_config(self, user_args):
    method _get_resource_manager (line 188) | def _get_resource_manager(self, active_resources):
    method _get_exp_resources (line 217) | def _get_exp_resources(self, args):
    method metric (line 238) | def metric(self):
    method fast_enabled (line 241) | def fast_enabled(self):
    method max_train_batch_size (line 244) | def max_train_batch_size(self):
    method mp_size (line 247) | def mp_size(self):
    method max_train_micro_batch_size_per_gpu (line 250) | def max_train_micro_batch_size_per_gpu(self):
    method min_train_micro_batch_size_per_gpu (line 261) | def min_train_micro_batch_size_per_gpu(self):
    method num_tuning_micro_batch_sizes (line 264) | def num_tuning_micro_batch_sizes(self):
    method fp16_enabled (line 267) | def fp16_enabled(self):
    method get_gpu_memory_info (line 273) | def get_gpu_memory_info(self):
    method get_activation_memory_per_gpu (line 276) | def get_activation_memory_per_gpu(self):
    method get_instantiation_memory_required_per_gpu (line 280) | def get_instantiation_memory_required_per_gpu(self, zero_stage):
    method _generate_experiments (line 306) | def _generate_experiments(self, tuning_space, max_train_batch_size_per...
    method tune (line 413) | def tune(self):
    method tune_space (line 521) | def tune_space(self,
    method get_plauteu_mbs (line 657) | def get_plauteu_mbs(self, tuning_space_name):
    method get_model_num_params (line 679) | def get_model_num_params(self):
    method model_info_profile_run (line 683) | def model_info_profile_run(self):
    method update_records (line 736) | def update_records(self, space_name, exp, metric_val, num_exps):
    method get_best_space_record (line 742) | def get_best_space_record(self, space_name):
    method get_best_space_records (line 756) | def get_best_space_records(self):
    method run_tuning_micro_batch_sizes (line 769) | def run_tuning_micro_batch_sizes(self,
    method get_min_max_micro_batch_size (line 868) | def get_min_max_micro_batch_size(self,
    method get_gas_from_user_config (line 1016) | def get_gas_from_user_config(self):
    method get_val_from_user_args (line 1033) | def get_val_from_user_args(self, ds_name):
    method get_tuning_micro_batch_size_list (line 1044) | def get_tuning_micro_batch_size_list(self,
    method run_ds_config (line 1092) | def run_ds_config(self, ds_config, exp_name):
    method write_optimal_config (line 1113) | def write_optimal_config(self):
    method run_after_tuning (line 1141) | def run_after_tuning(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/autotuning/config.py
  class DeepSpeedAutotuningConfig (line 10) | class DeepSpeedAutotuningConfig(DeepSpeedConfigObject):
    method __init__ (line 11) | def __init__(self, param_dict):
    method _initialize (line 32) | def _initialize(self, autotuning_dict):
  function get_model_info_config (line 122) | def get_model_info_config(param_dict):
  function get_default_model_info_config (line 133) | def get_default_model_info_config():

FILE: benchmark/third_party/DeepSpeed/deepspeed/autotuning/scheduler.py
  class ResourceManager (line 28) | class ResourceManager:
    method __init__ (line 29) | def __init__(self,
    method schedule_experiments (line 59) | def schedule_experiments(self, exp_paths):
    method run_job (line 96) | def run_job(self, exp: dict, reservations):
    method experiment_check (line 122) | def experiment_check(self, pbar):
    method resource_request (line 143) | def resource_request(self, exp):
    method status (line 163) | def status(self):
    method run (line 169) | def run(self):
    method save_exp_results_to_database (line 197) | def save_exp_results_to_database(self, message, ranks=None, path=None):
    method parse_results (line 222) | def parse_results(self, metric):
    method clear (line 257) | def clear(self):
  class Node (line 270) | class Node:
    method __init__ (line 271) | def __init__(self, host, max_slots):
    method reserve_slots (line 276) | def reserve_slots(self, slot_request: int) -> list:
    method restore_slots (line 280) | def restore_slots(self, slots: list):
  class Reservation (line 284) | class Reservation:
    method __init__ (line 285) | def __init__(self, node, slots):
    method restore_slots (line 289) | def restore_slots(self):
    method desc (line 292) | def desc(self):
  function get_job_id (line 297) | def get_job_id():
  function get_user (line 310) | def get_user():
  function run_experiment (line 319) | def run_experiment(exp: dict, reservations, user_script, user_args):
  function clean_up (line 410) | def clean_up(exp: dict, reservations):

FILE: benchmark/third_party/DeepSpeed/deepspeed/autotuning/tuner/base_tuner.py
  class BaseTuner (line 8) | class BaseTuner:
    method __init__ (line 9) | def __init__(self, exps, resource_manager, metric):
    method has_next (line 18) | def has_next(self):
    method next_batch (line 25) | def next_batch(self, sample_size):
    method update (line 29) | def update(self):
    method tune (line 32) | def tune(self, sample_size=1, n_trials=1000, early_stopping=None):

FILE: benchmark/third_party/DeepSpeed/deepspeed/autotuning/tuner/cost_model.py
  class XGBoostCostModel (line 9) | class XGBoostCostModel():
    method __init__ (line 10) | def __init__(self, loss_type, num_threads=None, log_interval=25, upper...
    method fit (line 45) | def fit(self, xs, ys):
    method predict (line 56) | def predict(self, xs):

FILE: benchmark/third_party/DeepSpeed/deepspeed/autotuning/tuner/index_based_tuner.py
  class RandomTuner (line 6) | class RandomTuner(BaseTuner):
    method __init__ (line 8) | def __init__(self, exps: list, resource_manager, metric):
    method next_batch (line 11) | def next_batch(self, sample_size=1):
  class GridSearchTuner (line 21) | class GridSearchTuner(BaseTuner):
    method __init__ (line 23) | def __init__(self, exps: list, resource_manager, metric):
    method next_batch (line 26) | def next_batch(self, sample_size=1):

FILE: benchmark/third_party/DeepSpeed/deepspeed/autotuning/tuner/model_based_tuner.py
  class ModelBasedTuner (line 14) | class ModelBasedTuner(BaseTuner):
    method __init__ (line 16) | def __init__(self, exps: list, resource_manager, metric, tuning_sapce):
    method find_estimated_top_configs (line 53) | def find_estimated_top_configs(self):
    method next_batch (line 80) | def next_batch(self, sample_size):
    method has_next (line 113) | def has_next(self):
    method update (line 116) | def update(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/autotuning/tuner/utils.py
  function index_to_feature (line 7) | def index_to_feature(p, dims):
  function feature_to_index (line 16) | def feature_to_index(feature, dims):
  function dict_to_dims (line 25) | def dict_to_dims(tuning_space):
  function gen_combinations (line 40) | def gen_combinations(d: dict):
  function flatten (line 52) | def flatten(d, parent_key='', sep='_'):
  function dict_to_feature (line 63) | def dict_to_feature(feature_dict, keys, max_value=None):

FILE: benchmark/third_party/DeepSpeed/deepspeed/autotuning/utils.py
  function search_error (line 12) | def search_error(filename):
  function was_interruptted (line 24) | def was_interruptted(filename):
  function find_replace_str (line 36) | def find_replace_str(value, replace_dict):
  function find_replace (line 54) | def find_replace(target, replace_dict):
  function get_list (line 69) | def get_list(val):
  function combine_dict (line 76) | def combine_dict(d, u):
  function del_if_exists (line 90) | def del_if_exists(t, d):
  function replace_dict (line 105) | def replace_dict(d, u, ignored_keys=[]):
  function get_val_by_key (line 128) | def get_val_by_key(d: dict, k):
  function set_val_by_key (line 137) | def set_val_by_key(d: dict, k, vv):
  function fetch_hostfile (line 145) | def fetch_hostfile(hostfile_path):
  function validate_ds_config (line 176) | def validate_ds_config(config: dict):
  function remove_dupe_dicts (line 210) | def remove_dupe_dicts(l):
  function prune_config (line 224) | def prune_config(config, ignored_keys=[]):
  function prune_configs (line 248) | def prune_configs(configs, ignored_keys=[]):
  function get_tuning_keys (line 266) | def get_tuning_keys(tuning_space: dict):
  function get_all_configs (line 284) | def get_all_configs(tuning_space: dict, ignore_keys=None):
  function canonical_name (line 312) | def canonical_name(config: dict, tuning_keys=None, prefix="", omit_val=F...
  function get_first_config (line 384) | def get_first_config(config: dict):
  function write_experiments (line 400) | def write_experiments(exps: list, exps_dir: str):
  function memory_to_string (line 413) | def memory_to_string(n, postfix="", units=None, precision=2):
  function number_to_string (line 438) | def number_to_string(n, postfix="", units=None, precision=2):

FILE: benchmark/third_party/DeepSpeed/deepspeed/checkpoint/deepspeed_checkpoint.py
  class DeepSpeedCheckpoint (line 37) | class DeepSpeedCheckpoint(object):
    method __init__ (line 38) | def __init__(self, dir, tp_degree=None, pp_degree=None, dp_degree=None):
    method is_change_tp_degree (line 89) | def is_change_tp_degree(self):
    method is_change_pp_degree (line 92) | def is_change_pp_degree(self):
    method is_change_dp_degree (line 95) | def is_change_dp_degree(self):
    method show_2d_mapping (line 98) | def show_2d_mapping(self):
    method show_tp_embedding_map (line 108) | def show_tp_embedding_map(self):
    method show_tp_final_norm_map (line 111) | def show_tp_final_norm_map(self):
    method show_pp_tranformer_map (line 114) | def show_pp_tranformer_map(self):
    method show_transformer_file_map (line 117) | def show_transformer_file_map(self):
    method _build_global_state (line 120) | def _build_global_state(self):
    method get_zero_checkpoint_state (line 125) | def get_zero_checkpoint_state(self, pp_index, tp_index, dp_index) -> d...
    method get_zero_files (line 131) | def get_zero_files(self, pp_index, tp_index, dp_index) -> list:
    method get_embedding_layer_id (line 136) | def get_embedding_layer_id(self):
    method get_final_norm_layer_id (line 139) | def get_final_norm_layer_id(self):
    method get_iteration (line 142) | def get_iteration(self):
    method get_embedding_state (line 149) | def get_embedding_state(self, tp_index: int) -> Dict:
    method get_embedding_files (line 159) | def get_embedding_files(self, tp_index: int) -> list:
    method _get_checkpoint_value (line 163) | def _get_checkpoint_value(self, key):
    method get_args (line 170) | def get_args(self):
    method get_checkpoint_info (line 173) | def get_checkpoint_info(self, info_key=CHECKPOINT_INFO_KEY):
    method get_2d_parallel_state (line 176) | def get_2d_parallel_state(self, tp_index: int, pp_index: int) -> dict:
    method get_transformer_state (line 194) | def get_transformer_state(self, tp_index: int, pp_index: int) -> list:
    method get_pp_transformer_map (line 207) | def get_pp_transformer_map(self, pp_index: int) -> list:
    method get_final_norm_state (line 211) | def get_final_norm_state(self, tp_index: int) -> Dict:
    method get_final_norm_files (line 217) | def get_final_norm_files(self, tp_index: int) -> list:
    method _build_tp_other_layer_map (line 221) | def _build_tp_other_layer_map(self, layer_index: int):
    method get_2d_parallel_files (line 229) | def get_2d_parallel_files(self, tp_index: int, pp_index: int) -> list:
    method _build_pp_transformer_map (line 235) | def _build_pp_transformer_map(self):
    method _dump_mapping (line 246) | def _dump_mapping(self, data_map, map_tag=None):
    method _build_transformer_file_map (line 252) | def _build_transformer_file_map(self):
    method _sanity_check (line 272) | def _sanity_check(self):
    method validate_files (line 280) | def validate_files(self):
    method _get_layer_keys (line 285) | def _get_layer_keys(self):
    method _merge_state_dicts (line 293) | def _merge_state_dicts(self, sd_list):
    method _validate_folder (line 304) | def _validate_folder(self, dir):

FILE: benchmark/third_party/DeepSpeed/deepspeed/checkpoint/reshape_3d_utils.py
  class model_3d_desc (line 15) | class model_3d_desc(object):
    method __init__ (line 16) | def __init__(self, pp_degree=1, tp_degree=1, dp_degree=1):
    method reshape (line 21) | def reshape(self, target_3d_desc, verbose=False):
    method get_desc (line 37) | def get_desc(self):
    method world_size (line 40) | def world_size(self):
    method is_valid (line 43) | def is_valid(self, pp_index, tp_index, dp_index):
    method can_reshape (line 57) | def can_reshape(self, target_3d_desc):
  function get_model_3d_descriptor (line 77) | def get_model_3d_descriptor(dir):
  function flatten_dp_dimension (line 93) | def flatten_dp_dimension(meg_2d_map, src_2d_size, dp_degree):
  function unflatten_dp_dimension (line 104) | def unflatten_dp_dimension(meg_2d_map, dp_degree):

FILE: benchmark/third_party/DeepSpeed/deepspeed/checkpoint/reshape_meg_2d.py
  class meg_2d_parallel_map (line 4) | class meg_2d_parallel_map(object):
    method __init__ (line 5) | def __init__(self, pp_degree, tp_degree):
    method simple_init (line 10) | def simple_init(self):
    method add_data (line 17) | def add_data(self, pp_index, tp_index, data):
    method get_data (line 26) | def get_data(self, pp_index=None, tp_index=None):
    method print_data (line 38) | def print_data(self, tag):
    method _validate_indices (line 43) | def _validate_indices(self, pp_index, tp_index):
    method _make_key (line 47) | def _make_key(self, i, j):
  function _reshape_tp_dimension (line 51) | def _reshape_tp_dimension(old_2d_map, new_tp_degree):
  function _reshape_pp_dimension (line 63) | def _reshape_pp_dimension(old_2d_map, new_pp_degree):
  function reshape_meg_2d_parallel (line 75) | def reshape_meg_2d_parallel(old_pp_degree,
  function get_mpu_ranks (line 106) | def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None):
  function reshape (line 186) | def reshape(src, tgt):

FILE: benchmark/third_party/DeepSpeed/deepspeed/checkpoint/reshape_utils.py
  function basic_folder_validation (line 7) | def basic_folder_validation(dir):
  function get_files_with_prefix (line 12) | def get_files_with_prefix(all_files, prefix):
  function validate_files (line 22) | def validate_files(file_list):
  function get_files (line 28) | def get_files(dir):
  function get_zero_files (line 36) | def get_zero_files(dir):
  function partition_data (line 46) | def partition_data(data_list, num_partitions):
  function _key_list_to_string (line 58) | def _key_list_to_string(key_list):
  function merge_state_dict (line 62) | def merge_state_dict(dict_a, dict_b, key_list):
  function merge_state_list (line 74) | def merge_state_list(list_a, list_b, key_list):
  function merge_state (line 84) | def merge_state(state_a, state_b, key_list=[]):

FILE: benchmark/third_party/DeepSpeed/deepspeed/checkpoint/universal_checkpoint.py
  function load_hp_checkpoint_state (line 13) | def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size):
  function enable_universal_checkpoint (line 105) | def enable_universal_checkpoint(param_list):

FILE: benchmark/third_party/DeepSpeed/deepspeed/checkpoint/utils.py
  function get_model_ckpt_name_for_rank (line 8) | def get_model_ckpt_name_for_rank(base_folder, mp_rank_str):
  function get_zero_ckpt_name_for_rank (line 16) | def get_zero_ckpt_name_for_rank(base_folder, dp_rank, mp_rank):
  function get_layer_ckpt_name_for_rank (line 26) | def get_layer_ckpt_name_for_rank(base_folder, layer_id, tp_rank):

FILE: benchmark/third_party/DeepSpeed/deepspeed/checkpoint/zero_checkpoint.py
  class ZeROCheckpoint (line 15) | class ZeROCheckpoint(object):
    method __init__ (line 16) | def __init__(self, dir):
    method get_src_world_size (line 29) | def get_src_world_size(self):
    method get_src_tp_degree (line 32) | def get_src_tp_degree(self):
    method get_src_pp_degree (line 35) | def get_src_pp_degree(self):
    method get_src_dp_degree (line 38) | def get_src_dp_degree(self):
    method get_file_indices_for_rank (line 41) | def get_file_indices_for_rank(self, pp_index, tp_index, dp_index):
    method get_files_for_rank (line 46) | def get_files_for_rank(self, pp_index, tp_index, dp_index):
    method get_state_for_rank (line 50) | def get_state_for_rank(self,
    method print_3d_index_map (line 77) | def print_3d_index_map(self, tag=None):
    method print_3d_file_map (line 83) | def print_3d_file_map(self, tag=None):
    method reshape (line 93) | def reshape(self, target_3d_desc: model_3d_desc):
    method _strip_tensor_paddings (line 97) | def _strip_tensor_paddings(self, sd):
    method _clear_group_paddings (line 117) | def _clear_group_paddings(self, sd):
    method _get_optimizer_state (line 123) | def _get_optimizer_state(self, sd, state_key):
    method _get_param_group_states (line 130) | def _get_param_group_states(self, sd):
    method _update_partition_count (line 141) | def _update_partition_count(self, sd):

FILE: benchmark/third_party/DeepSpeed/deepspeed/comm/__init__.py
  function get_world_group (line 18) | def get_world_group():
  function get_global_rank (line 21) | def get_global_rank(group, group_rank):
  function allgather_fn (line 28) | def allgather_fn(output_tensor, input_tensor, group=None, async_op=False):
  function reduce_scatter_fn (line 34) | def reduce_scatter_fn(output_tensor, input_tensor, group=None, async_op=...
  function configure (line 40) | def configure(deepspeed_config=None,

FILE: benchmark/third_party/DeepSpeed/deepspeed/comm/backend.py
  class Backend (line 21) | class Backend(object):
    method __init__ (line 22) | def __init__(self, name='backend', rank=0, size=1):
    method is_initialized (line 32) | def is_initialized(self):
    method new_group (line 35) | def new_group(self):
    method init_process_group (line 39) | def init_process_group(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/comm/comm.py
  class ReduceOp (line 36) | class ReduceOp(Enum):
  function _configure_using_config_file (line 77) | def _configure_using_config_file(config):
  function configure (line 82) | def configure(
  function timed_op (line 111) | def timed_op(func):
  function init_deepspeed_backend (line 153) | def init_deepspeed_backend(ds_backend):
  function is_initialized (line 169) | def is_initialized():
  function destroy_process_group (line 177) | def destroy_process_group(group=None):
  function new_group (line 182) | def new_group(ranks):
  function is_available (line 188) | def is_available() -> bool:
  function set_backend (line 198) | def set_backend(backend):
  function broadcast (line 223) | def broadcast(tensor,
  function all_gather (line 235) | def all_gather(tensor_list,
  function has_reduce_scatter_base (line 249) | def has_reduce_scatter_base():
  function reduce_scatter_fn (line 256) | def reduce_scatter_fn(output_tensor,
  function reduce_scatter_base (line 292) | def reduce_scatter_base(output_tensor,
  function all_gather_base (line 309) | def all_gather_base(output_tensor,
  function has_allgather_base (line 323) | def has_allgather_base():
  function allgather_fn (line 330) | def allgather_fn(output_tensor,
  function all_to_all_single (line 360) | def all_to_all_single(output,
  function send (line 379) | def send(tensor,
  function recv (line 391) | def recv(tensor,
  function isend (line 403) | def isend(tensor,
  function irecv (line 415) | def irecv(tensor,
  function gather (line 427) | def gather(tensor,
  function scatter (line 444) | def scatter(tensor,
  function barrier (line 461) | def barrier(group=None,
  function monitored_barrier (line 472) | def monitored_barrier(group=None,
  function log_summary (line 482) | def log_summary():
  function reduce (line 491) | def reduce(tensor,
  function reduce_scatter (line 504) | def reduce_scatter(output,
  function all_reduce (line 521) | def all_reduce(tensor,
  function get_world_group (line 537) | def get_world_group():
  function get_world_size (line 543) | def get_world_size(group=None) -> int:
  function get_rank (line 559) | def get_rank(group=None):
  function get_local_rank (line 578) | def get_local_rank():
  function get_global_rank (line 591) | def get_global_rank(group=None, group_rank=0):
  function init_distributed (line 598) | def init_distributed(dist_backend="nccl",
  function mpi_discovery (line 661) | def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbo...
  function in_aml (line 705) | def in_aml():
  function in_aws_sm (line 710) | def in_aws_sm():
  function in_dlts (line 715) | def in_dlts():
  function patch_aml_env_for_torch_nccl_backend (line 720) | def patch_aml_env_for_torch_nccl_backend(master_port=6105, verbose=True):
  function patch_aws_sm_env_for_torch_nccl_backend (line 757) | def patch_aws_sm_env_for_torch_nccl_backend(verbose=True):

FILE: benchmark/third_party/DeepSpeed/deepspeed/comm/config.py
  class CommsConfig (line 10) | class CommsConfig(BaseModel):
    class Config (line 11) | class Config:
  class CommsLoggerConfig (line 18) | class CommsLoggerConfig(CommsConfig):
  class DeepSpeedCommsConfig (line 26) | class DeepSpeedCommsConfig:
    method __init__ (line 27) | def __init__(self, ds_config):

FILE: benchmark/third_party/DeepSpeed/deepspeed/comm/torch.py
  class TorchBackend (line 11) | class TorchBackend(Backend):
    method __init__ (line 19) | def __init__(self, backend, timeout, init_method, name='torch'):
    method init_process_group (line 32) | def init_process_group(self, backend, timeout, init_method):
    method all_reduce (line 39) | def all_reduce(self,
    method reduce (line 50) | def reduce(self, tensor, dst, op=ReduceOp.SUM, group=None, async_op=Fa...
    method reduce_scatter (line 57) | def reduce_scatter(self,
    method broadcast (line 69) | def broadcast(self, tensor, src, group=None, async_op=False):
    method all_gather (line 75) | def all_gather(self, tensor_list, tensor, group=None, async_op=False):
    method all_gather_base (line 81) | def all_gather_base(self, output_tensor, input_tensor, group=None, asy...
    method reduce_scatter_base (line 95) | def reduce_scatter_base(self,
    method all_to_all_single (line 114) | def all_to_all_single(self,
    method send (line 128) | def send(self, tensor, dst, group=None, tag=0):
    method recv (line 131) | def recv(self, tensor, src=None, group=None, tag=0):
    method isend (line 134) | def isend(self, tensor, dst, group=None, tag=0):
    method irecv (line 137) | def irecv(self, tensor, src=None, group=None, tag=0):
    method gather (line 140) | def gather(self, tensor, gather_list=None, dst=0, group=None, async_op...
    method scatter (line 147) | def scatter(self, tensor, scatter_list=None, src=0, group=None, async_...
    method barrier (line 154) | def barrier(self,
    method monitored_barrier (line 164) | def monitored_barrier(self,
    method get_rank (line 174) | def get_rank(self, group=None):
    method get_world_size (line 177) | def get_world_size(self, group=None):
    method is_initialized (line 180) | def is_initialized(self):
    method get_backend (line 183) | def get_backend(self, group=None):
    method new_group (line 186) | def new_group(self, ranks):
    method get_global_rank (line 189) | def get_global_rank(self, group, group_rank):
    method get_world_group (line 196) | def get_world_group(self):
    method destroy_process_group (line 199) | def destroy_process_group(self, group=None):
    method _reduce_op (line 202) | def _reduce_op(self, op):

FILE: benchmark/third_party/DeepSpeed/deepspeed/comm/utils.py
  function older_torch (line 7) | def older_torch():
  function has_allgather_base (line 21) | def has_allgather_base():
  function has_reduce_scatter_base (line 28) | def has_reduce_scatter_base():
  function get_local_rank_from_launcher (line 35) | def get_local_rank_from_launcher():
  function get_world_rank_from_launcher (line 50) | def get_world_rank_from_launcher():
  function get_world_size_from_launcher (line 65) | def get_world_size_from_launcher():
  function get_default_args (line 83) | def get_default_args(func):
  function get_tensor_position (line 93) | def get_tensor_position(func):
  function get_tensor_kwarg (line 111) | def get_tensor_kwarg(func, kwargs):
  function get_msg_size_from_args (line 125) | def get_msg_size_from_args(func, *args, **kwargs):
  function get_debug_log_name (line 152) | def get_debug_log_name(func_args, debug):

FILE: benchmark/third_party/DeepSpeed/deepspeed/compression/basic_layer.py
  class QuantAct (line 12) | class QuantAct(nn.Module):
    method __init__ (line 22) | def __init__(self, act_range_momentum=0.95, quant_mode='symmetric'):
    method forward (line 34) | def forward(self, x, num_bits, *args):
  class Embedding_Compress (line 61) | class Embedding_Compress(nn.Embedding):
    method __init__ (line 62) | def __init__(self, *kargs):
    method extra_repr (line 70) | def extra_repr(self):
    method enable_weight_quantization (line 76) | def enable_weight_quantization(self,
    method fix_weight_quantization (line 105) | def fix_weight_quantization(self):
    method forward (line 114) | def forward(self, input):
  class LinearLayer_Compress (line 134) | class LinearLayer_Compress(nn.Linear):
    method __init__ (line 138) | def __init__(self, *kargs, bias=True):
    method extra_repr (line 154) | def extra_repr(self):
    method enable_sparse_pruning (line 159) | def enable_sparse_pruning(self, ratio, method):
    method enable_row_pruning (line 179) | def enable_row_pruning(self, ratio, method):
    method enable_head_pruning (line 200) | def enable_head_pruning(self, ratio, method, num_heads):
    method fix_sparse_pruning_helper (line 217) | def fix_sparse_pruning_helper(self):
    method fix_row_col_pruning_helper (line 227) | def fix_row_col_pruning_helper(self, mask=None, dim_reduction=False):
    method fix_head_pruning_helper (line 269) | def fix_head_pruning_helper(self, mask=None, num_heads=None, dim_reduc...
    method get_mask (line 312) | def get_mask(self, pruning_type='row'):
    method enable_weight_quantization (line 341) | def enable_weight_quantization(self,
    method fix_weight_quantization (line 369) | def fix_weight_quantization(self):
    method enable_activation_quantization (line 378) | def enable_activation_quantization(self, bits, quantization_type, rang...
    method head_pruning_reshape (line 390) | def head_pruning_reshape(self, w, mask):
    method forward (line 397) | def forward(self, input, skip_bias_add=False):
  class Conv2dLayer_Compress (line 444) | class Conv2dLayer_Compress(nn.Conv2d):
    method __init__ (line 448) | def __init__(self, *kargs):
    method __repr__ (line 461) | def __repr__(self):
    method enable_sparse_pruning (line 484) | def enable_sparse_pruning(self, ratio, method):
    method enable_channel_pruning (line 503) | def enable_channel_pruning(self, ratio, method):
    method fix_sparse_pruning_helper (line 529) | def fix_sparse_pruning_helper(self):
    method fix_channel_pruning_helper (line 539) | def fix_channel_pruning_helper(self, mask=None, dim_reduction=False):
    method get_mask (line 575) | def get_mask(self, pruning_type='sparse'):
    method fix_weight_quantization (line 597) | def fix_weight_quantization(self):
    method enable_weight_quantization (line 606) | def enable_weight_quantization(self,
    method enable_activation_quantization (line 628) | def enable_activation_quantization(self, bits, quantization_type, rang...
    method forward (line 640) | def forward(self, input):
  class BNLayer_Compress (line 683) | class BNLayer_Compress(nn.BatchNorm2d):
    method fix_channel_pruning_helper (line 684) | def fix_channel_pruning_helper(self, mask, dim_reduction=True):
  function _reduce (line 691) | def _reduce(input_):
  function split_tensor_along_last_dim (line 705) | def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split...
  function _split (line 726) | def _split(input_):
  function _gather (line 746) | def _gather(input_):
  class _CopyToModelParallelRegion (line 769) | class _CopyToModelParallelRegion(torch.autograd.Function):
    method forward (line 772) | def forward(ctx, input_):
    method backward (line 776) | def backward(ctx, grad_output):
  class _ReduceFromModelParallelRegion (line 780) | class _ReduceFromModelParallelRegion(torch.autograd.Function):
    method forward (line 783) | def forward(ctx, input_):
    method backward (line 787) | def backward(ctx, grad_output):
  class _ScatterToModelParallelRegion (line 791) | class _ScatterToModelParallelRegion(torch.autograd.Function):
    method forward (line 794) | def forward(ctx, input_):
    method backward (line 798) | def backward(ctx, grad_output):
  class _GatherFromModelParallelRegion (line 802) | class _GatherFromModelParallelRegion(torch.autograd.Function):
    method forward (line 805) | def forward(ctx, input_):
    method backward (line 809) | def backward(ctx, grad_output):
  function copy_to_model_parallel_region (line 818) | def copy_to_model_parallel_region(input_):
  function reduce_from_model_parallel_region (line 822) | def reduce_from_model_parallel_region(input_):
  function scatter_to_model_parallel_region (line 826) | def scatter_to_model_parallel_region(input_):
  function gather_from_model_parallel_region (line 830) | def gather_from_model_parallel_region(input_):
  class ColumnParallelLinear_Compress (line 834) | class ColumnParallelLinear_Compress(LinearLayer_Compress):
    method __init__ (line 835) | def __init__(self,
    method forward (line 860) | def forward(self, input_):
  class RowParallelLinear_Compress (line 877) | class RowParallelLinear_Compress(LinearLayer_Compress):
    method __init__ (line 878) | def __init__(self,
    method forward (line 903) | def forward(self, input_):

FILE: benchmark/third_party/DeepSpeed/deepspeed/compression/compress.py
  function check_deepspeed_config (line 10) | def check_deepspeed_config(config):
  function get_module_name (line 23) | def get_module_name(group_name,
  function get_compress_methods (line 49) | def get_compress_methods(model, compress_methods, mpu=None):
  function init_compression (line 97) | def init_compression(model, deepspeed_config, teacher_model=None, mpu=No...
  function redundancy_clean (line 127) | def redundancy_clean(model, deepspeed_config, mpu=None):
  function student_initialization (line 182) | def student_initialization(student_model, teacher_model, deepspeed_config):

FILE: benchmark/third_party/DeepSpeed/deepspeed/compression/config.py
  function get_compression_config (line 6) | def get_compression_config(param_dict):
  function get_layer_reduction (line 25) | def get_layer_reduction(param_dict):
  function get_layer_reduction_enabled (line 35) | def get_layer_reduction_enabled(param_dict):
  function get_layer_reduction_params (line 44) | def get_layer_reduction_params(param_dict):
  function get_quantize_enabled (line 53) | def get_quantize_enabled(param_dict):
  function get_weight_quantization (line 62) | def get_weight_quantization(param_dict):
  function get_weight_quantization_shared_parameters (line 76) | def get_weight_quantization_shared_parameters(param_dict):
  function get_weight_quantization_different_groups (line 140) | def get_weight_quantization_different_groups(param_dict):
  function get_activation_quantization (line 170) | def get_activation_quantization(param_dict):
  function get_activation_quantization_shared_parameters (line 189) | def get_activation_quantization_shared_parameters(param_dict):
  function get_activation_quantization_different_groups (line 220) | def get_activation_quantization_different_groups(param_dict):
  function get_sparse_pruning (line 245) | def get_sparse_pruning(param_dict):
  function get_sparse_pruning_shared_parameters (line 259) | def get_sparse_pruning_shared_parameters(param_dict):
  function get_sparse_pruning_different_groups (line 282) | def get_sparse_pruning_different_groups(param_dict):
  function get_row_pruning (line 307) | def get_row_pruning(param_dict):
  function get_row_pruning_shared_parameters (line 321) | def get_row_pruning_shared_parameters(param_dict):
  function get_row_pruning_different_groups (line 343) | def get_row_pruning_different_groups(param_dict):
  function get_head_pruning (line 367) | def get_head_pruning(param_dict):
  function get_head_pruning_shared_parameters (line 381) | def get_head_pruning_shared_parameters(param_dict):
  function get_head_pruning_different_groups (line 406) | def get_head_pruning_different_groups(param_dict):
  function get_channel_pruning (line 430) | def get_channel_pruning(param_dict):
  function get_channel_pruning_shared_parameters (line 444) | def get_channel_pruning_shared_parameters(param_dict):
  function get_channel_pruning_different_groups (line 468) | def get_channel_pruning_different_groups(param_dict):

FILE: benchmark/third_party/DeepSpeed/deepspeed/compression/helper.py
  function recursive_getattr (line 6) | def recursive_getattr(model, module_name):
  function recursive_setattr (line 22) | def recursive_setattr(model, module_name, module):
  function module_replacement (line 40) | def module_replacement(model, module_name, compression_technique=None, m...
  function is_module_compressible (line 189) | def is_module_compressible(module, mpu=None):
  function compression_preparation (line 204) | def compression_preparation(model, compression_techinique_list, mpu):
  function fix_compression (line 226) | def fix_compression(model,
  function convert_conv1d_to_linear (line 259) | def convert_conv1d_to_linear(model, convert_type):

FILE: benchmark/third_party/DeepSpeed/deepspeed/compression/scheduler.py
  class compression_scheduler (line 7) | class compression_scheduler():
    method __init__ (line 11) | def __init__(self, model, compression_config):
    method make_init (line 27) | def make_init(self):
    method check_weight_quantization (line 56) | def check_weight_quantization(self):
    method check_activation_quantization (line 75) | def check_activation_quantization(self):
    method check_sparse_pruning (line 93) | def check_sparse_pruning(self):
    method check_head_pruning (line 110) | def check_head_pruning(self):
    method check_row_pruning (line 126) | def check_row_pruning(self):
    method check_channel_pruning (line 142) | def check_channel_pruning(self):
    method check_all_modules (line 159) | def check_all_modules(self):
    method step (line 168) | def step(self, step_zero_check=False):

FILE: benchmark/third_party/DeepSpeed/deepspeed/compression/utils.py
  class TopKBinarizer (line 6) | class TopKBinarizer(autograd.Function):
    method forward (line 15) | def forward(ctx, inputs: torch.tensor, threshold: float, sigmoid: bool):
    method backward (line 48) | def backward(ctx, gradOutput):
  class SymQuantizer (line 56) | class SymQuantizer(torch.autograd.Function):
    method forward (line 61) | def forward(ctx, input, num_bits, min_value=None, max_value=None, num_...
    method backward (line 93) | def backward(ctx, grad_output):
  class AsymQuantizer (line 98) | class AsymQuantizer(torch.autograd.Function):
    method forward (line 103) | def forward(ctx, input, num_bits, min_value=None, max_value=None, num_...
    method backward (line 139) | def backward(ctx, grad_output):
  class TernaryQuantizer (line 144) | class TernaryQuantizer(torch.autograd.Function):
    method forward (line 149) | def forward(ctx, input, num_bits, min_value=None, max_value=None, num_...
    method backward (line 179) | def backward(ctx, grad_output):
  class BinaryQuantizer (line 184) | class BinaryQuantizer(torch.autograd.Function):
    method forward (line 189) | def forward(ctx, input, num_bits, min_value=None, max_value=None, num_...
    method backward (line 214) | def backward(ctx, grad_output):

FILE: benchmark/third_party/DeepSpeed/deepspeed/elasticity/config.py
  class ElasticityError (line 9) | class ElasticityError(Exception):
  class ElasticityConfigError (line 15) | class ElasticityConfigError(ElasticityError):
  class ElasticityIncompatibleWorldSize (line 21) | class ElasticityIncompatibleWorldSize(ElasticityError):
  class ElasticityConfig (line 27) | class ElasticityConfig:
    method __init__ (line 46) | def __init__(self, param_dict):
    method repr (line 118) | def repr(self):
    method __repr__ (line 121) | def __repr__(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/elasticity/elastic_agent.py
  class DSElasticAgent (line 23) | class DSElasticAgent(LocalElasticAgent):
    method __init__ (line 24) | def __init__(
    method _set_master_addr_port (line 36) | def _set_master_addr_port(store: Store,
    method _start_workers (line 52) | def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
    method _invoke_run (line 115) | def _invoke_run(self, role: str = "default") -> RunResult:

FILE: benchmark/third_party/DeepSpeed/deepspeed/elasticity/elasticity.py
  function get_candidate_batch_sizes (line 61) | def get_candidate_batch_sizes(base_list, max_acceptable_batch_size):
  function get_valid_gpus (line 75) | def get_valid_gpus(batch_size, micro_batches, min_valid_gpus, max_valid_...
  function get_best_candidates (line 97) | def get_best_candidates(candidate_batch_sizes,
  function _get_compatible_gpus_v01 (line 125) | def _get_compatible_gpus_v01(micro_batches,
  function _get_compatible_gpus_v02 (line 173) | def _get_compatible_gpus_v02(micro_batches,
  function _compatible_ds_version_check (line 237) | def _compatible_ds_version_check(target_deepspeed_version: str):
  function elasticity_enabled (line 248) | def elasticity_enabled(ds_config: dict):
  function ensure_immutable_elastic_config (line 254) | def ensure_immutable_elastic_config(runtime_elastic_config_dict: dict):
  function compute_elastic_config (line 287) | def compute_elastic_config(ds_config: dict,

FILE: benchmark/third_party/DeepSpeed/deepspeed/elasticity/utils.py
  function is_torch_elastic_compatible (line 4) | def is_torch_elastic_compatible():

FILE: benchmark/third_party/DeepSpeed/deepspeed/env_report.py
  function op_report (line 23) | def op_report(verbose=True):
  function ninja_installed (line 54) | def ninja_installed():
  function nvcc_version (line 62) | def nvcc_version():
  function debug_report (line 79) | def debug_report():
  function parse_arguments (line 112) | def parse_arguments():
  function main (line 127) | def main(hide_operator_status=False, hide_errors_and_warnings=False):
  function cli_main (line 133) | def cli_main():

FILE: benchmark/third_party/DeepSpeed/deepspeed/inference/config.py
  class DtypeEnum (line 10) | class DtypeEnum(Enum):
    method __new__ (line 20) | def __new__(cls, *values):
    method __repr__ (line 29) | def __repr__(self):
  class MoETypeEnum (line 37) | class MoETypeEnum(str, Enum):
  class DeepSpeedTPConfig (line 42) | class DeepSpeedTPConfig(DeepSpeedConfigModel):
  class DeepSpeedMoEConfig (line 60) | class DeepSpeedMoEConfig(DeepSpeedConfigModel):
  class QuantTypeEnum (line 83) | class QuantTypeEnum(str, Enum):
  class BaseQuantConfig (line 88) | class BaseQuantConfig(DeepSpeedConfigModel):
  class WeightQuantConfig (line 95) | class WeightQuantConfig(BaseQuantConfig):
  class ActivationQuantConfig (line 99) | class ActivationQuantConfig(BaseQuantConfig):
  class QKVQuantConfig (line 103) | class QKVQuantConfig(DeepSpeedConfigModel):
  class QuantizationConfig (line 107) | class QuantizationConfig(DeepSpeedConfigModel):
  class InferenceCheckpointConfig (line 115) | class InferenceCheckpointConfig(DeepSpeedConfigModel):
  class DeepSpeedInferenceConfig (line 121) | class DeepSpeedInferenceConfig(DeepSpeedConfigModel):
    method moe_backward_compat (line 269) | def moe_backward_compat(cls, field_value, values):
    class Config (line 274) | class Config:

FILE: benchmark/third_party/DeepSpeed/deepspeed/inference/engine.py
  class InferenceEngine (line 32) | class InferenceEngine(Module):
    method __init__ (line 37) | def __init__(self, model, config):
    method profile_model_time (line 137) | def profile_model_time(self, use_cuda_events=True):
    method _get_model_config_generate (line 147) | def _get_model_config_generate(self, config):
    method remove_mask_prepare_for_bloom (line 155) | def remove_mask_prepare_for_bloom(self):
    method _pre_forward_hook (line 160) | def _pre_forward_hook(self, module, *inputs, **kwargs):
    method _post_forward_hook (line 167) | def _post_forward_hook(self, module, input, output):
    method _create_model_parallel_group (line 177) | def _create_model_parallel_group(self, config):
    method _create_ep_parallel_group (line 190) | def _create_ep_parallel_group(self, moe_experts):
    method _init_quantization_setting (line 220) | def _init_quantization_setting(self, quantization_setting):
    method _validate_args (line 236) | def _validate_args(self, mpu, replace_with_kernel_inject):
    method load_model_with_checkpoint (line 268) | def load_model_with_checkpoint(self, r_module):
    method _apply_injection_policy (line 335) | def _apply_injection_policy(self, config, client_module=None):
    method _get_all_ckpt_names (line 355) | def _get_all_ckpt_names(self, checkpoints_path, tag):
    method _get_ckpt_name (line 365) | def _get_ckpt_name(self, checkpoints_path, tag, mp_placeholder=None):
    method _load_checkpoint (line 378) | def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None):
    method _choose_module_key (line 439) | def _choose_module_key(self, sd):
    method _convert_to_dtype (line 447) | def _convert_to_dtype(self, config):
    method _create_cuda_graph (line 464) | def _create_cuda_graph(self, *inputs, **kwargs):
    method _graph_replay (line 483) | def _graph_replay(self, *inputs, **kwargs):
    method model_times (line 493) | def model_times(self):
    method forward (line 505) | def forward(self, *inputs, **kwargs):

FILE: benchmark/third_party/DeepSpeed/deepspeed/launcher/launch.py
  function parse_args (line 31) | def parse_args():
  function terminate_process_tree (line 115) | def terminate_process_tree(pid):
  function main (line 129) | def main():

FILE: benchmark/third_party/DeepSpeed/deepspeed/launcher/multinode_runner.py
  class MultiNodeRunner (line 13) | class MultiNodeRunner(ABC):
    method __init__ (line 14) | def __init__(self, args, world_info_base64):
    method backend_exists (line 23) | def backend_exists(self):
    method get_cmd (line 27) | def get_cmd(self, environment, active_resources):
    method add_export (line 30) | def add_export(self, key, var):
    method parse_user_args (line 33) | def parse_user_args(self):
    method name (line 37) | def name(self):
    method validate_args (line 41) | def validate_args(self):
  class PDSHRunner (line 45) | class PDSHRunner(MultiNodeRunner):
    method __init__ (line 46) | def __init__(self, args, world_info_base64):
    method backend_exists (line 49) | def backend_exists(self):
    method name (line 53) | def name(self):
    method parse_user_args (line 56) | def parse_user_args(self):
    method get_cmd (line 61) | def get_cmd(self, environment, active_resources):
  class OpenMPIRunner (line 109) | class OpenMPIRunner(MultiNodeRunner):
    method __init__ (line 110) | def __init__(self, args, world_info_base64, resource_pool):
    method backend_exists (line 115) | def backend_exists(self):
    method name (line 120) | def name(self):
    method validate_args (line 123) | def validate_args(self):
    method get_cmd (line 133) | def get_cmd(self, environment, active_resources):
  class SlurmRunner (line 164) | class SlurmRunner(MultiNodeRunner):
    method __init__ (line 165) | def __init__(self, args, world_info_base64, resource_pool):
    method backend_exists (line 169) | def backend_exists(self):
    method name (line 173) | def name(self):
    method get_cmd (line 176) | def get_cmd(self, environment, active_resources):
  class MVAPICHRunner (line 211) | class MVAPICHRunner(MultiNodeRunner):
    method __init__ (line 212) | def __init__(self, args, world_info_base64, resource_pool):
    method backend_exists (line 235) | def backend_exists(self):
    method name (line 253) | def name(self):
    method validate_args (line 256) | def validate_args(self):
    method get_cmd (line 266) | def get_cmd(self, environment, active_resources):

FILE: benchmark/third_party/DeepSpeed/deepspeed/launcher/runner.py
  function parse_args (line 37) | def parse_args(args=None):
  function fetch_hostfile (line 177) | def fetch_hostfile(hostfile_path):
  function _stable_remove_duplicates (line 208) | def _stable_remove_duplicates(data):
  function parse_resource_filter (line 218) | def parse_resource_filter(host_info, include_str="", exclude_str=""):
  function parse_inclusion_exclusion (line 308) | def parse_inclusion_exclusion(resource_pool, inclusion, exclusion):
  function encode_world_info (line 318) | def encode_world_info(world_info):
  function run_autotuning (line 324) | def run_autotuning(args, active_resources):
  function parse_num_nodes (line 338) | def parse_num_nodes(str_num_nodes: str, elastic_training: bool):
  function main (line 353) | def main(args=None):

FILE: benchmark/third_party/DeepSpeed/deepspeed/model_implementations/diffusers/unet.py
  class DSUNet (line 7) | class DSUNet(torch.nn.Module):
    method __init__ (line 8) | def __init__(self, unet, enable_cuda_graph=True):
    method _graph_replay (line 22) | def _graph_replay(self, *inputs, **kwargs):
    method forward (line 32) | def forward(self, *inputs, **kwargs):
    method _create_cuda_graph (line 43) | def _create_cuda_graph(self, *inputs, **kwargs):
    method _forward (line 62) | def _forward(self, sample, timestamp, encoder_hidden_states, return_di...

FILE: benchmark/third_party/DeepSpeed/deepspeed/model_implementations/diffusers/vae.py
  class DSVAE (line 7) | class DSVAE(torch.nn.Module):
    method __init__ (line 8) | def __init__(self, vae, enable_cuda_graph=True):
    method _graph_replay_decoder (line 19) | def _graph_replay_decoder(self, *inputs, **kwargs):
    method _decode (line 29) | def _decode(self, x, return_dict=True):
    method _create_cuda_graph_decoder (line 32) | def _create_cuda_graph_decoder(self, *inputs, **kwargs):
    method decode (line 52) | def decode(self, *inputs, **kwargs):
    method _graph_replay_encoder (line 63) | def _graph_replay_encoder(self, *inputs, **kwargs):
    method _encode (line 73) | def _encode(self, x, return_dict=True):
    method _create_cuda_graph_encoder (line 76) | def _create_cuda_graph_encoder(self, *inputs, **kwargs):
    method encode (line 96) | def encode(self, *inputs, **kwargs):
    method _graph_replay_all (line 107) | def _graph_replay_all(self, *inputs, **kwargs):
    method forward (line 117) | def forward(self, *inputs, **kwargs):
    method _create_cuda_graph (line 128) | def _create_cuda_graph(self, *inputs, **kwargs):
    method _forward (line 147) | def _forward(self, sample, timestamp, encoder_hidden_states, return_di...

FILE: benchmark/third_party/DeepSpeed/deepspeed/model_implementations/transformers/clip_encoder.py
  class DSClipEncoder (line 7) | class DSClipEncoder(torch.nn.Module):
    method __init__ (line 8) | def __init__(self, enc, enable_cuda_graph=False):
    method _build_causal_attention_mask (line 23) | def _build_causal_attention_mask(self, bsz, seq_len, dtype):
    method _graph_replay (line 34) | def _graph_replay(self, *inputs, **kwargs):
    method forward (line 44) | def forward(self, *inputs, **kwargs):
    method _create_cuda_graph (line 56) | def _create_cuda_graph(self, *inputs, **kwargs):
    method _forward (line 77) | def _forward(self, *inputs, **kwargs):

FILE: benchmark/third_party/DeepSpeed/deepspeed/model_implementations/transformers/ds_transformer.py
  class DeepSpeedTransformerInference (line 17) | class DeepSpeedTransformerInference(nn.Module):
    method __init__ (line 35) | def __init__(self,
    method forward (line 84) | def forward(

FILE: benchmark/third_party/DeepSpeed/deepspeed/module_inject/inject.py
  function module_inject (line 6) | def module_inject(layer_obj,
  function test_hi (line 84) | def test_hi():

FILE: benchmark/third_party/DeepSpeed/deepspeed/module_inject/layers.py
  class LinearAllreduce (line 9) | class LinearAllreduce(nn.Module):
    method __init__ (line 10) | def __init__(self, weight, bias=None, mp_group=None):
    method forward (line 16) | def forward(self, input):
  class LinearLayer (line 25) | class LinearLayer(nn.Module):
    method __init__ (line 26) | def __init__(self, weight_shape=None, dtype=torch.half, weight=None, b...
    method forward (line 43) | def forward(self, input):
  class Normalize (line 50) | class Normalize(nn.Module):
    method __init__ (line 51) | def __init__(self, dim, dtype=torch.float, eps=1e-5):
    method forward (line 57) | def forward(self, input):
  class EmbeddingLayer (line 61) | class EmbeddingLayer(nn.Module):
    method __init__ (line 62) | def __init__(self, weight_shape, dtype=torch.half):
    method forward (line 70) | def forward(self, input):
  class OPTEmbedding (line 74) | class OPTEmbedding(EmbeddingLayer):
    method __init__ (line 78) | def __init__(self, weight_shape):
    method forward (line 84) | def forward(self, attention_mask: torch.LongTensor, past_key_values_le...

FILE: benchmark/third_party/DeepSpeed/deepspeed/module_inject/load_checkpoint.py
  function load_model_with_checkpoint (line 9) | def load_model_with_checkpoint(r_module,

FILE: benchmark/third_party/DeepSpeed/deepspeed/module_inject/module_quantize.py
  function quantize_transformer_layer (line 4) | def quantize_transformer_layer(orig_layer_impl, model, megatron=False, p...
  function quantize_module (line 64) | def quantize_module(model, orig_class, quantize_fn):
  function _quantize_module (line 69) | def _quantize_module(model, policies):

FILE: benchmark/third_party/DeepSpeed/deepspeed/module_inject/replace_module.py
  class ReplaceWithTensorSlicing (line 21) | class ReplaceWithTensorSlicing:
    method __init__ (line 22) | def __init__(self, mp_group=None, mp_size=1, out_dim=1, in_dim=0):
    method merge_assert (line 31) | def merge_assert(self, dim1, dim2):
    method qkv_copy (line 37) | def qkv_copy(self, dst, src):
    method copy (line 87) | def copy(self, dst, src):
  function get_transformer_name (line 126) | def get_transformer_name(replaced_module):
  class GroupQuantizer (line 141) | class GroupQuantizer:
    method __init__ (line 142) | def __init__(self, q_int8=True, group_size=1, num_bits=8):
    method quantize (line 147) | def quantize(self, inputs, qkv=True, count=1, parallel_dim=0):
  function _module_match (line 192) | def _module_match(module):
  function generic_injection (line 200) | def generic_injection(module, fp16=False, enable_cuda_graph=True):
  function replace_transformer_layer (line 296) | def replace_transformer_layer(orig_layer_impl,
  function revert_transformer_layer (line 1074) | def revert_transformer_layer(orig_layer_impl, model, config, preln=False):
  function replace_module (line 1142) | def replace_module(model, orig_class, replace_fn, _replace_policy):
  function _replace_module (line 1175) | def _replace_module(model, policies, layer_id=0):

FILE: benchmark/third_party/DeepSpeed/deepspeed/module_inject/replace_policy.py
  class DSPolicy (line 15) | class DSPolicy(ABC):
    method __init__ (line 18) | def __init__(self):
    method attention (line 21) | def attention(self):
  class UNetPolicy (line 30) | class UNetPolicy(DSPolicy):
    method __init__ (line 31) | def __init__(self):
    method match (line 39) | def match(self, module):
    method apply (line 42) | def apply(self, module, enable_cuda_graph=True):
    method attention (line 47) | def attention(self, client_module):
  class VAEPolicy (line 71) | class VAEPolicy(DSPolicy):
    method __init__ (line 72) | def __init__(self):
    method match (line 80) | def match(self, module):
    method apply (line 83) | def apply(self, module, enable_cuda_graph=True):
  class TransformerPolicy (line 89) | class TransformerPolicy(DSPolicy):
    method __init__ (line 94) | def __init__(
    method attention (line 118) | def attention(self):
    method get_hidden_heads (line 126) | def get_hidden_heads(self):
    method mlp (line 132) | def mlp(self):
    method layerNorm (line 140) | def layerNorm(self):
    method get_param_names (line 148) | def get_param_names(self):
  class HFBertLayerPolicy (line 174) | class HFBertLayerPolicy(TransformerPolicy):
    method __init__ (line 175) | def __init__(self, client_module, inference=False):
    method get_hidden_heads (line 190) | def get_hidden_heads(self):
    method attention (line 194) | def attention(self):
    method mlp (line 213) | def mlp(self):
    method layerNorm (line 223) | def layerNorm(self):
  class HFCLIPLayerPolicy (line 236) | class HFCLIPLayerPolicy(TransformerPolicy):
    method __init__ (line 237) | def __init__(self, client_module, inference=False):
    method get_hidden_heads (line 249) | def get_hidden_heads(self):
    method attention (line 253) | def attention(self):
    method mlp (line 272) | def mlp(self):
    method layerNorm (line 279) | def layerNorm(self):
  class HFGPTNEOLayerPolicy (line 286) | class HFGPTNEOLayerPolicy(TransformerPolicy):
    method __init__ (line 287) | def __init__(self, client_module, inference=True):
    method get_hidden_heads (line 296) | def get_hidden_heads(self):
    method attention (line 300) | def attention(self):
    method mlp (line 315) | def mlp(self):
    method layerNorm (line 322) | def layerNorm(self):
    method get_param_names (line 328) | def get_param_names(self):
  class HFGPTJLayerPolicy (line 345) | class HFGPTJLayerPolicy(TransformerPolicy):
    method __init__ (line 348) | def __init__(self, client_module, inference=True):
    method get_hidden_heads (line 357) | def get_hidden_heads(self):
    method attention (line 361) | def attention(self):
    method mlp (line 376) | def mlp(self):
    method layerNorm (line 383) | def layerNorm(self):
    method get_param_names (line 389) | def get_param_names(self):
  class MegatronLayerPolicy (line 404) | class MegatronLayerPolicy(TransformerPolicy):
    method __init__ (line 410) | def __init__(self, client_module, inference=True):
    method get_hidden_heads (line 425) | def get_hidden_heads(self):
    method attention (line 429) | def attention(self):
    method mlp (line 444) | def mlp(self, moe_type='standard'):
    method layerNorm (line 478) | def layerNorm(self):
  class HFGPT2LayerPolicy (line 485) | class HFGPT2LayerPolicy(TransformerPolicy):
    method __init__ (line 488) | def __init__(self, client_module, inference=True):
    method get_hidden_heads (line 498) | def get_hidden_heads(self):
    method attention (line 502) | def attention(self):
    method mlp (line 511) | def mlp(self):
    method layerNorm (line 518) | def layerNorm(self):
  class BLOOMLayerPolicy (line 525) | class BLOOMLayerPolicy(TransformerPolicy):
    method __init__ (line 528) | def __init__(self,
    method get_hidden_heads (line 544) | def get_hidden_heads(self):
    method attention (line 548) | def attention(self):
    method mlp (line 557) | def mlp(self):
    method layerNorm (line 564) | def layerNorm(self):
    method get_param_names (line 570) | def get_param_names(self):
  class GPTNEOXLayerPolicy (line 587) | class GPTNEOXLayerPolicy(TransformerPolicy):
    method __init__ (line 591) | def __init__(self, client_module, inference=True, megatron_v2=True, sp...
    method get_hidden_heads (line 604) | def get_hidden_heads(self):
    method attention (line 613) | def attention(self):
    method mlp (line 627) | def mlp(self):
    method layerNorm (line 634) | def layerNorm(self):
    method get_param_names (line 640) | def get_param_names(self):
  class HFOPTLayerPolicy (line 657) | class HFOPTLayerPolicy(TransformerPolicy):
    method __init__ (line 660) | def __init__(self, client_module, inference=True, use_load_prefix=True):
    method get_hidden_heads (line 675) | def get_hidden_heads(self):
    method attention (line 679) | def attention(self):
    method mlp (line 700) | def mlp(self):
    method layerNorm (line 707) | def layerNorm(self):
    method get_param_names (line 713) | def get_param_names(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/moe/experts.py
  class Experts (line 9) | class Experts(torch.nn.Module):
    method __init__ (line 10) | def __init__(self, expert, num_local_experts=1, expert_group_name=None):
    method forward (line 24) | def forward(self, inputs):

FILE: benchmark/third_party/DeepSpeed/deepspeed/moe/layer.py
  class MoE (line 15) | class MoE(torch.nn.Module):
    method __init__ (line 34) | def __init__(self,
    method set_deepspeed_parallelism (line 87) | def set_deepspeed_parallelism(self):
    method _create_process_groups (line 90) | def _create_process_groups(self):
    method forward (line 108) | def forward(self, hidden_states, used_token=None):

FILE: benchmark/third_party/DeepSpeed/deepspeed/moe/mappings.py
  function _gather_tokens (line 27) | def _gather_tokens(input_, dim=0):
  function _drop_tokens (line 50) | def _drop_tokens(input_, dim=0):
  class _GatherTokens (line 62) | class _GatherTokens(torch.autograd.Function):
    method symbolic (line 65) | def symbolic(graph, input_, dim):
    method forward (line 69) | def forward(ctx, input_, dim):
    method backward (line 74) | def backward(ctx, grad_output):
  class _DropTokens (line 78) | class _DropTokens(torch.autograd.Function):
    method symbolic (line 82) | def symbolic(graph, input_, dim):
    method forward (line 86) | def forward(ctx, input_, dim):
    method backward (line 91) | def backward(ctx, input_):
  function gather_tokens (line 95) | def gather_tokens(input_, dim=0):
  function drop_tokens (line 103) | def drop_tokens(input_, dim=0):

FILE: benchmark/third_party/DeepSpeed/deepspeed/moe/sharded_moe.py
  function multiplicative_jitter (line 46) | def multiplicative_jitter(x, device: torch.device, epsilon=1e-2):
  function gumbel_rsample (line 72) | def gumbel_rsample(shape: Tuple, device: torch.device) -> Tensor:
  class _AllToAll (line 89) | class _AllToAll(torch.autograd.Function):
    method forward (line 91) | def forward(
    method backward (line 103) | def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor]:
  function einsum (line 114) | def einsum(rule, a, b):
  function _capacity (line 155) | def _capacity(gates: Tensor, capacity_factor: Tensor, min_capacity: Tens...
  function _top_idx (line 168) | def _top_idx(source, k):
  function _one_hot_to_float (line 173) | def _one_hot_to_float(x, num_classes):
  function top1gating (line 177) | def top1gating(logits: Tensor,
  function top2gating (line 278) | def top2gating(logits: Tensor,
  class TopKGate (line 351) | class TopKGate(Module):
    method __init__ (line 369) | def __init__(self,
    method forward (line 396) | def forward(
  class MOELayer (line 439) | class MOELayer(Base):
    method __init__ (line 456) | def __init__(self,
    method _set_ep_group (line 488) | def _set_ep_group(self, ep_group):
    method forward (line 491) | def forward(self, *input: Tensor, **kwargs: Any) -> Tensor:

FILE: benchmark/third_party/DeepSpeed/deepspeed/moe/utils.py
  function has_moe_layers (line 6) | def has_moe_layers(m):
  function is_moe_param (line 18) | def is_moe_param(param: torch.Tensor) -> bool:
  function split_params_into_shared_and_expert_params (line 24) | def split_params_into_shared_and_expert_params(
  function split_params_grads_into_shared_and_expert_params (line 36) | def split_params_grads_into_shared_and_expert_params(
  function split_params_into_different_moe_groups_for_optimizer (line 62) | def split_params_into_different_moe_groups_for_optimizer(param_groups: T...

FILE: benchmark/third_party/DeepSpeed/deepspeed/monitor/config.py
  class MonitorConfig (line 10) | class MonitorConfig(BaseModel):
    class Config (line 11) | class Config:
  class TensorBoardConfig (line 18) | class TensorBoardConfig(MonitorConfig):
  class WandbConfig (line 24) | class WandbConfig(MonitorConfig):
  class CSVConfig (line 31) | class CSVConfig(MonitorConfig):
  class DeepSpeedMonitorConfig (line 37) | class DeepSpeedMonitorConfig:
    method __init__ (line 38) | def __init__(self, ds_config):

FILE: benchmark/third_party/DeepSpeed/deepspeed/monitor/csv_monitor.py
  class csvMonitor (line 7) | class csvMonitor(Monitor):
    method __init__ (line 8) | def __init__(self, monitor_config):
    method setup_log_dir (line 16) | def setup_log_dir(self, base=os.path.join(os.path.expanduser("~"), "cs...
    method write_events (line 34) | def write_events(self, event_list):

FILE: benchmark/third_party/DeepSpeed/deepspeed/monitor/monitor.py
  class Monitor (line 9) | class Monitor(ABC):
    method __init__ (line 11) | def __init__(self, monitor_config):
    method write_events (line 15) | def write_events(self, event_list):
  class MonitorMaster (line 24) | class MonitorMaster(Monitor):
    method __init__ (line 25) | def __init__(self, monitor_config):
    method write_events (line 40) | def write_events(self, event_list):

FILE: benchmark/third_party/DeepSpeed/deepspeed/monitor/tensorboard.py
  class TensorBoardMonitor (line 8) | class TensorBoardMonitor(Monitor):
    method __init__ (line 9) | def __init__(self, monitor_config):
    method get_summary_writer (line 21) | def get_summary_writer(self,
    method write_events (line 43) | def write_events(self, event_list, flush=True):
    method flush (line 50) | def flush(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/monitor/utils.py
  function check_tb_availability (line 1) | def check_tb_availability():
  function check_wandb_availability (line 11) | def check_wandb_availability():

FILE: benchmark/third_party/DeepSpeed/deepspeed/monitor/wandb.py
  class WandbMonitor (line 7) | class WandbMonitor(Monitor):
    method __init__ (line 8) | def __init__(self, monitor_config):
    method log (line 21) | def log(self, data, step=None, commit=None, sync=None):
    method write_events (line 26) | def write_events(self, event_list):

FILE: benchmark/third_party/DeepSpeed/deepspeed/nebula/config.py
  class DeepSpeedNebulaConfig (line 10) | class DeepSpeedNebulaConfig(DeepSpeedConfigObject):
    method __init__ (line 11) | def __init__(self, param_dict):
    method _initialize (line 27) | def _initialize(self, nebula_dict):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/adagrad/cpu_adagrad.py
  class DeepSpeedCPUAdagrad (line 10) | class DeepSpeedCPUAdagrad(torch.optim.Optimizer):
    method __init__ (line 13) | def __init__(self,
    method __del__ (line 35) | def __del__(self):
    method __setstate__ (line 40) | def __setstate__(self, state):
    method step (line 46) | def step(self, closure=None, fp16_param_groups=None):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/adam/cpu_adam.py
  class DeepSpeedCPUAdam (line 12) | class DeepSpeedCPUAdam(torch.optim.Optimizer):
    method __init__ (line 15) | def __init__(self,
    method __del__ (line 105) | def __del__(self):
    method __setstate__ (line 110) | def __setstate__(self, state):
    method step (line 116) | def step(self, closure=None, fp16_param_groups=None):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/adam/fused_adam.py
  class FusedAdam (line 15) | class FusedAdam(torch.optim.Optimizer):
    method __init__ (line 49) | def __init__(self,
    method zero_grad (line 77) | def zero_grad(self):
    method step (line 85) | def step(self,

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/adam/multi_tensor_apply.py
  class MultiTensorApply (line 9) | class MultiTensorApply(object):
    method __init__ (line 10) | def __init__(self, chunk_size):
    method __call__ (line 13) | def __call__(self, op, noop_flag_buffer, tensor_lists, *args):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/lamb/fused_lamb.py
  class FusedLamb (line 12) | class FusedLamb(torch.optim.Optimizer):
    method __init__ (line 38) | def __init__(self,
    method step (line 67) | def step(self,
    method get_lamb_coeffs (line 187) | def get_lamb_coeffs(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/quantizer/quantizer.py
  function ds_quantizer (line 12) | def ds_quantizer(input, groups=1, bit_num=8, sr=False, asym=False):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py
  class BertSparseSelfAttention (line 9) | class BertSparseSelfAttention(nn.Module):
    method __init__ (line 16) | def __init__(
    method transpose_for_scores (line 46) | def transpose_for_scores(self, x):
    method forward (line 52) | def forward(self, hidden_states, attention_mask):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/sparse_attention/matmul.py
  function _kernel (line 12) | def _kernel(A,
  class _sparse_matmul (line 212) | class _sparse_matmul(torch.autograd.Function):
    method load_balance (line 224) | def load_balance(sizes, block):
    method get_locks (line 275) | def get_locks(size, dev):
    method make_sdd_lut (line 286) | def make_sdd_lut(layout, block, dtype, device):
    method _sdd_matmul (line 317) | def _sdd_matmul(a,
    method make_dxx_lut (line 427) | def make_dxx_lut(layout, block, step, trans, device, transform=lambda ...
    method _dds_matmul (line 521) | def _dds_matmul(a,
    method _dsd_matmul (line 593) | def _dsd_matmul(a,
    method forward (line 672) | def forward(ctx,
    method backward (line 734) | def backward(ctx, dc):
  class MatMul (line 777) | class MatMul:
    method make_lut (line 785) | def make_lut(self, dtype, device):
    method __init__ (line 819) | def __init__(self, layout, block, mode, trans_a=False, trans_b=False, ...
    method _pad_shape (line 873) | def _pad_shape(x, is_sparse):
    method __call__ (line 879) | def __call__(self, a, b):
    method _validate_inputs (line 946) | def _validate_inputs(self, a, b):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/sparse_attention/softmax.py
  function next_power_of_2 (line 10) | def next_power_of_2(n):
  function num_warps (line 21) | def num_warps(n):
  function _forward (line 38) | def _forward(X,
  function _backward (line 112) | def _backward(X, scale, DX, LUT, sizemax, stride_zx, stride_zdx, **meta):
  class _sparse_softmax (line 142) | class _sparse_softmax(torch.autograd.Function):
    method make_lut (line 147) | def make_lut(layout, block, device):
    method forward (line 169) | def forward(ctx,
    method backward (line 245) | def backward(ctx, dx):
  class Softmax (line 263) | class Softmax:
    method sparse_softmax (line 271) | def sparse_softmax(*args, **kwargs):
    method make_lut (line 274) | def make_lut(self, device):
    method __init__ (line 284) | def __init__(self, layout, block, bench=False):
    method __call__ (line 300) | def __call__(self,

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/sparse_attention/sparse_attention_utils.py
  class SparseAttentionUtils (line 13) | class SparseAttentionUtils:
    method extend_position_embedding (line 19) | def extend_position_embedding(model, max_position):
    method update_tokenizer_model_max_length (line 68) | def update_tokenizer_model_max_length(tokenizer, max_position):
    method replace_model_self_attention_with_sparse_self_attention (line 85) | def replace_model_self_attention_with_sparse_self_attention(
    method replace_self_attention_layer_with_sparse_self_attention_layer (line 123) | def replace_self_attention_layer_with_sparse_self_attention_layer(
    method pad_to_block_size (line 151) | def pad_to_block_size(block_size,
    method unpad_sequence_output (line 210) | def unpad_sequence_output(pad_len, sequence_output):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/sparse_attention/sparse_self_attention.py
  class SparseSelfAttention (line 11) | class SparseSelfAttention(nn.Module):
    method __init__ (line 18) | def __init__(
    method get_layout (line 48) | def get_layout(self, L):
    method get_ops (line 63) | def get_ops(self, H, L):
    method transpose_key_for_scores (line 87) | def transpose_key_for_scores(self, x, L):
    method transpose_mask_for_sparse (line 93) | def transpose_mask_for_sparse(self, qtype, x, is_key_padding_mask=False):
    method forward (line 103) | def forward(self,

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/sparse_attention/sparsity_config.py
  class SparsityConfig (line 9) | class SparsityConfig:
    method __init__ (line 13) | def __init__(self, num_heads, block=16, different_layout_per_head=False):
    method setup_layout (line 29) | def setup_layout(self, seq_len):
    method check_and_propagate_first_head_layout (line 48) | def check_and_propagate_first_head_layout(self, layout):
  class DenseSparsityConfig (line 63) | class DenseSparsityConfig(SparsityConfig):
    method __init__ (line 67) | def __init__(self, num_heads, block=16, different_layout_per_head=False):
    method make_layout (line 79) | def make_layout(self, seq_len):
  class FixedSparsityConfig (line 94) | class FixedSparsityConfig(SparsityConfig):
    method __init__ (line 99) | def __init__(self,
    method set_local_layout (line 154) | def set_local_layout(self, h, layout):
    method set_global_layout (line 175) | def set_global_layout(self, h, layout):
    method make_layout (line 224) | def make_layout(self, seq_len):
  class VariableSparsityConfig (line 243) | class VariableSparsityConfig(SparsityConfig):
    method __init__ (line 253) | def __init__(self,
    method set_random_layout (line 309) | def set_random_layout(self, h, layout):
    method set_local_layout (line 331) | def set_local_layout(self, h, layout):
    method set_global_layout (line 364) | def set_global_layout(self, h, layout):
    method make_layout (line 401) | def make_layout(self, seq_len):
  class BigBirdSparsityConfig (line 421) | class BigBirdSparsityConfig(SparsityConfig):
    method __init__ (line 426) | def __init__(self,
    method set_random_layout (line 459) | def set_random_layout(self, h, layout):
    method set_sliding_window_layout (line 486) | def set_sliding_window_layout(self, h, layout):
    method set_global_layout_itc (line 510) | def set_global_layout_itc(self, h, layout):
    method make_layout (line 539) | def make_layout(self, seq_len):
  class BSLongformerSparsityConfig (line 559) | class BSLongformerSparsityConfig(SparsityConfig):
    method __init__ (line 567) | def __init__(self,
    method set_sliding_window_layout (line 608) | def set_sliding_window_layout(self, h, layout):
    method set_global_layout (line 632) | def set_global_layout(self, h, layout):
    method make_layout (line 667) | def make_layout(self, seq_len):
  class LocalSlidingWindowSparsityConfig (line 686) | class LocalSlidingWindowSparsityConfig(SparsityConfig):
    method __init__ (line 690) | def __init__(self,
    method set_sliding_window_layout (line 708) | def set_sliding_window_layout(self, h, layout):
    method make_layout (line 731) | def make_layout(self, seq_len):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/sparse_attention/trsrc/__init__.py
  function _build_file_index (line 5) | def _build_file_index(directory, suffix='.tr'):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/transformer/inference/bias_add.py
  function nhwc_bias_add (line 13) | def nhwc_bias_add(activation: torch.Tensor,

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/transformer/inference/config.py
  class TransformerConfig (line 8) | class TransformerConfig():
    method __init__ (line 9) | def __init__(self, hidden_size, intermediate_size, heads, num_hidden_l...
  class DeepSpeedInferenceConfig (line 17) | class DeepSpeedInferenceConfig(TransformerConfig):
    method __init__ (line 43) | def __init__(self,
    method from_dict (line 99) | def from_dict(cls, json_object):
    method from_json_file (line 106) | def from_json_file(cls, json_file):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/transformer/inference/diffusers_2d_transformer.py
  class Diffusers2DTransformerConfig (line 6) | class Diffusers2DTransformerConfig():
    method __init__ (line 7) | def __init__(self, int8_quantization=False):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/transformer/inference/diffusers_attention.py
  function load_triton_flash_attn (line 17) | def load_triton_flash_attn():
  class DeepSpeedDiffusersAttentionFunction (line 30) | class DeepSpeedDiffusersAttentionFunction(Function):
    method forward (line 32) | def forward(ctx,
    method backward (line 120) | def backward(ctx, grad_output, grad_output1, grad_output2, grad_output3):
  class DeepSpeedDiffusersAttention (line 125) | class DeepSpeedDiffusersAttention(nn.Module):
    method __init__ (line 134) | def __init__(
    method forward (line 214) | def forward(self, input, context=None, input_mask=None):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/transformer/inference/diffusers_transformer_block.py
  function load_transformer_module (line 19) | def load_transformer_module():
  function load_spatial_module (line 26) | def load_spatial_module():
  class DeepSpeedDiffusersTransformerBlock (line 33) | class DeepSpeedDiffusersTransformerBlock(nn.Module):
    method __init__ (line 34) | def __init__(self,
    method forward (line 93) | def forward(self, hidden_states, context=None, timestep=None):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/transformer/inference/ds_attention.py
  class DeepSpeedSelfAttentionFunction (line 16) | class DeepSpeedSelfAttentionFunction(Function):
    method forward (line 18) | def forward(ctx,
    method backward (line 375) | def backward(ctx, grad_output, grad_output1, grad_output2, grad_output3):
  class DeepSpeedSelfAttention (line 380) | class DeepSpeedSelfAttention(nn.Module):
    method __init__ (line 383) | def __init__(self,
    method forward (line 446) | def forward(self,

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/transformer/inference/ds_mlp.py
  class DeepSpeedMLPFunction (line 16) | class DeepSpeedMLPFunction(Function):
    method forward (line 18) | def forward(ctx,
    method backward (line 84) | def backward(ctx, grad_output):
  class DeepSpeedMLP (line 89) | class DeepSpeedMLP(nn.Module):
    method __init__ (line 90) | def __init__(self,
    method forward (line 156) | def forward(self, input, residual, residual_norm, bias):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/transformer/inference/moe_inference.py
  class DeepSpeedMoEInferenceConfig (line 20) | class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig):
    method __init__ (line 45) | def __init__(self,
    method from_dict (line 104) | def from_dict(cls, json_object):
    method from_json_file (line 111) | def from_json_file(cls, json_file):
  class DeepSpeedMLPFunction (line 117) | class DeepSpeedMLPFunction(Function):
    method forward (line 119) | def forward(ctx,
    method backward (line 162) | def backward(ctx, grad_output):
  class DeepSpeedMoEMLP (line 167) | class DeepSpeedMoEMLP(nn.Module):
    method __init__ (line 168) | def __init__(self,
    method forward (line 194) | def forward(self, input, async_op=False):
  class DeepSpeedMoEInference (line 208) | class DeepSpeedMoEInference(nn.Module):
    method __init__ (line 226) | def __init__(self,
    method res_coef_func (line 313) | def res_coef_func(self, inp, async_op):
    method moe_gate_einsum (line 317) | def moe_gate_einsum(self, attention_output):
    method expert_exec (line 328) | def expert_exec(self, dispatched_input):
    method _alltoall (line 349) | def _alltoall(self, dispatched_attention):
    method scale_expert_output (line 359) | def scale_expert_output(self, attention_output, expert_output, combine...
    method forward (line 368) | def forward(self,

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/transformer/inference/triton_ops.py
  function _fwd_kernel (line 12) | def _fwd_kernel(
  class triton_flash_attn (line 103) | class triton_flash_attn(torch.nn.Module):
    method __init__ (line 104) | def __init__(self, ):
    method forward (line 107) | def forward(self, q, k, v, sm_scale, block_128=True):

FILE: benchmark/third_party/DeepSpeed/deepspeed/ops/transformer/transformer.py
  class TransformerConfig (line 17) | class TransformerConfig():
    method __init__ (line 18) | def __init__(self,
  class DeepSpeedTransformerConfig (line 38) | class DeepSpeedTransformerConfig(TransformerConfig):
    method __init__ (line 92) | def __init__(self,
    method from_dict (line 139) | def from_dict(cls, json_object):
    method from_json_file (line 146) | def from_json_file(cls, json_file):
  class DeepSpeedTransformerFunction (line 152) | class DeepSpeedTransformerFunction(Function):
    method forward (line 154) | def forward(ctx,
    method backward (line 324) | def backward(ctx, grad_output):
  class DeepSpeedTransformerLayer (line 459) | class DeepSpeedTransformerLayer(nn.Module):
    method __init__ (line 474) | def __init__(self, config, initial_weights=None, initial_biases=None):
    method init_transformer_weights (line 558) | def init_transformer_weights(self, adjust_init_range=False):
    method forward (line 578) | def forward(self,

FILE: benchmark/third_party/DeepSpeed/deepspeed/profiling/config.py
  class DeepSpeedFlopsProfilerConfig (line 10) | class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject):
    method __init__ (line 11) | def __init__(self, param_dict):
    method _initialize (line 26) | def _initialize(self, flops_profiler_dict):

FILE: benchmark/third_party/DeepSpeed/deepspeed/profiling/flops_profiler/profiler.py
  class FlopsProfiler (line 17) | class FlopsProfiler(object):
    method __init__ (line 53) | def __init__(self, model, ds_engine=None):
    method start_profile (line 59) | def start_profile(self, ignore_list=None):
    method stop_profile (line 120) | def stop_profile(self):
    method reset_profile (line 149) | def reset_profile(self):
    method end_profile (line 163) | def end_profile(self):
    method get_total_flops (line 187) | def get_total_flops(self, as_string=False):
    method get_total_macs (line 199) | def get_total_macs(self, as_string=False):
    method get_total_duration (line 211) | def get_total_duration(self, as_string=False):
    method get_total_params (line 223) | def get_total_params(self, as_string=False):
    method print_model_profile (line 235) | def print_model_profile(self,
    method print_model_aggregated_profile (line 402) | def print_model_aggregated_profile(self, module_depth=-1, top_modules=1):
  function _prod (line 474) | def _prod(dims):
  function _linear_flops_compute (line 481) | def _linear_flops_compute(input, weight, bias=None):
  function _relu_flops_compute (line 487) | def _relu_flops_compute(input, inplace=False):
  function _prelu_flops_compute (line 491) | def _prelu_flops_compute(input: Tensor, weight: Tensor):
  function _elu_flops_compute (line 495) | def _elu_flops_compute(input: Tensor, alpha: float = 1.0, inplace: bool ...
  function _leaky_relu_flops_compute (line 499) | def _leaky_relu_flops_compute(input: Tensor,
  function _relu6_flops_compute (line 505) | def _relu6_flops_compute(input: Tensor, inplace: bool = False):
  function _silu_flops_compute (line 509) | def _silu_flops_compute(input: Tensor, inplace: bool = False):
  function _gelu_flops_compute (line 513) | def _gelu_flops_compute(input):
  function _pool_flops_compute (line 517) | def _pool_flops_compute(input,
  function _conv_flops_compute (line 529) | def _conv_flops_compute(input,
  function _conv_trans_flops_compute (line 569) | def _conv_trans_flops_compute(
  function _batch_norm_flops_compute (line 615) | def _batch_norm_flops_compute(
  function _layer_norm_flops_compute (line 633) | def _layer_norm_flops_compute(
  function _group_norm_flops_compute (line 645) | def _group_norm_flops_compute(input: Tensor,
  function _instance_norm_flops_compute (line 655) | def _instance_norm_flops_compute(
  function _upsample_flops_compute (line 670) | def _upsample_flops_compute(input,
  function _softmax_flops_compute (line 689) | def _softmax_flops_compute(input, dim=None, _stacklevel=3, dtype=None):
  function _embedding_flops_compute (line 693) | def _embedding_flops_compute(
  function _dropout_flops_compute (line 705) | def _dropout_flops_compute(input, p=0.5, training=True, inplace=False):
  function _matmul_flops_compute (line 709) | def _matmul_flops_compute(input, other, *, out=None):
  function _addmm_flops_compute (line 717) | def _addmm_flops_compute(input, mat1, mat2, *, beta=1, alpha=1, out=None):
  function _einsum_flops_compute (line 725) | def _einsum_flops_compute(equation, *operands):
  function _tensor_addmm_flops_compute (line 747) | def _tensor_addmm_flops_compute(self, mat1, mat2, *, beta=1, alpha=1, ou...
  function _mul_flops_compute (line 755) | def _mul_flops_compute(input, other, *, out=None):
  function _add_flops_compute (line 759) | def _add_flops_compute(input, other, *, alpha=1, out=None):
  function _elementwise_flops_compute (line 763) | def _elementwise_flops_compute(input, other):
  function wrapFunc (line 788) | def wrapFunc(func, funcFlopCompute):
  function _patch_functionals (line 806) | def _patch_functionals():
  function _patch_tensor_methods (line 861) | def _patch_tensor_methods():
  function _reload_functionals (line 881) | def _reload_functionals():
  function _reload_tensor_methods (line 914) | def _reload_tensor_methods():
  function _rnn_flops (line 918) | def _rnn_flops(flops, rnn_module, w_ih, w_hh, input_size):
  function _rnn_forward_hook (line 943) | def _rnn_forward_hook(rnn_module, input, output):
  function _rnn_cell_forward_hook (line 971) | def _rnn_cell_forward_hook(rnn_cell_module, input, output):
  function num_to_string (line 999) | def num_to_string(num, precision=2):
  function macs_to_string (line 1010) | def macs_to_string(macs, units=None, precision=2):
  function number_to_string (line 1031) | def number_to_string(num, units=None, precision=2):
  function flops_to_string (line 1052) | def flops_to_string(flops, units=None, precision=2):
  function params_to_string (line 1077) | def params_to_string(params_num, units=None, precision=2):
  function duration_to_string (line 1094) | def duration_to_string(duration, units=None, precision=2):
  function get_module_flops (line 1115) | def get_module_flops(module):
  function get_module_macs (line 1123) | def get_module_macs(module):
  function get_module_duration (line 1131) | def get_module_duration(module):
  function get_model_profile (line 1139) | def get_model_profile(

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/__init__.py
  class DeepSpeedOptimizer (line 1) | class DeepSpeedOptimizer(object):
  class ZeROOptimizer (line 5) | class ZeROOptimizer(DeepSpeedOptimizer):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/activation_checkpointing/checkpointing.py
  function detach_variable (line 64) | def detach_variable(inputs, device=None):
  function _set_cuda_rng_state (line 89) | def _set_cuda_rng_state(new_state, device=-1):
  class CudaRNGStatesTracker (line 122) | class CudaRNGStatesTracker:
    method __init__ (line 130) | def __init__(self):
    method reset (line 136) | def reset(self):
    method get_states (line 141) | def get_states(self):
    method set_states (line 146) | def set_states(self, states):
    method add (line 151) | def add(self, name, seed):
    method fork (line 169) | def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
  function get_cuda_rng_tracker (line 193) | def get_cuda_rng_tracker():
  function model_parallel_cuda_manual_seed (line 198) | def model_parallel_cuda_manual_seed(seed):
  function get_partition_start (line 243) | def get_partition_start(item):
  function get_partition_size (line 251) | def get_partition_size(item):
  function gather_partitioned_activations (line 259) | def gather_partitioned_activations(tensors, device=None):
  function extract_tensors (line 303) | def extract_tensors(all_objects):
  function merge_tensors (line 323) | def merge_tensors(tensor_objects, non_tensor_objects, tensor_flags):
  function is_activation_to_checkpoint (line 365) | def is_activation_to_checkpoint(item):
  function partition_activations (line 373) | def partition_activations(args, cpu_checkpoint, contiguous_checkpoint):
  function get_partitioned_activations_for_backward (line 439) | def get_partitioned_activations_for_backward(args, inputs, contiguous_ch...
  function get_cpu_activations_for_backward (line 486) | def get_cpu_activations_for_backward(args, inputs):
  class CheckpointFunction (line 499) | class CheckpointFunction(torch.autograd.Function):
    method forward (line 510) | def forward(ctx, run_function, all_outputs, *args):
    method backward (line 630) | def backward(ctx, *grads):
  function checkpoint (line 749) | def checkpoint(function, *args):
  function partition_activations_in_checkpoint (line 761) | def partition_activations_in_checkpoint(partition_activation):
  function set_num_layers (line 769) | def set_num_layers(nlayers):
  function reset (line 774) | def reset():
  function _configure_using_config_file (line 800) | def _configure_using_config_file(config, mpu=None):
  function _configure_defaults (line 815) | def _configure_defaults():
  function configure (line 831) | def configure(
  function is_configured (line 913) | def is_configured():

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/activation_checkpointing/config.py
  class DeepSpeedActivationCheckpointingConfig (line 59) | class DeepSpeedActivationCheckpointingConfig(DeepSpeedConfigObject):
    method __init__ (line 60) | def __init__(self, param_dict):
    method _initialize (line 77) | def _initialize(self, act_chkpt_config_dict):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/bf16_optimizer.py
  class BF16_Optimizer (line 38) | class BF16_Optimizer(ZeROOptimizer):
    method __init__ (line 39) | def __init__(self,
    method _setup_for_real_optimizer (line 95) | def _setup_for_real_optimizer(self):
    method _enable_universal_checkpoint (line 189) | def _enable_universal_checkpoint(self):
    method _create_param_mapping (line 193) | def _create_param_mapping(self):
    method _link_all_hp_params (line 206) | def _link_all_hp_params(self):
    method initialize_optimizer_states (line 221) | def initialize_optimizer_states(self):
    method _split_flat_tensor (line 235) | def _split_flat_tensor(self, flat_tensor, num_elem_list):
    method _update_storage_to_flattened_tensor (line 246) | def _update_storage_to_flattened_tensor(self, tensor_list, flat_tensor):
    method _flatten_dense_tensors_aligned (line 251) | def _flatten_dense_tensors_aligned(self, tensor_list, alignment):
    method step (line 255) | def step(self, closure=None):
    method backward (line 280) | def backward(self, loss, update_hp_grads=True, clear_lp_grads=False, *...
    method update_hp_grads (line 296) | def update_hp_grads(self, clear_lp_grads=False):
    method get_grads_for_reduction (line 315) | def get_grads_for_reduction(self):
    method get_grads_for_norm (line 319) | def get_grads_for_norm(self, for_clipping=False):
    method update_lp_params (line 339) | def update_lp_params(self):
    method clear_hp_grads (line 352) | def clear_hp_grads(self):
    method clear_lp_grads (line 359) | def clear_lp_grads(self):
    method state_dict (line 364) | def state_dict(self):
    method _restore_from_bit16_weights (line 377) | def _restore_from_bit16_weights(self):
    method refresh_fp32_params (line 383) | def refresh_fp32_params(self):
    method load_state_dict (line 386) | def load_state_dict(self,
    method _load_legacy_checkpoint (line 400) | def _load_legacy_checkpoint(self,
    method _load_universal_checkpoint (line 425) | def _load_universal_checkpoint(self,
    method param_groups (line 432) | def param_groups(self):
    method _load_hp_checkpoint_state (line 436) | def _load_hp_checkpoint_state(self, checkpoint_dir):
  function _get_padded_tensor (line 452) | def _get_padded_tensor(src_tensor, size):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/checkpoint_engine/checkpoint_engine.py
  class CheckpointEngine (line 1) | class CheckpointEngine(object):
    method __init__ (line 4) | def __init__(self, config_params=None):
    method create (line 7) | def create(self, tag):
    method save (line 11) | def save(self, state_dict, path: str):
    method load (line 14) | def load(self, path: str, map_location=None):
    method commit (line 17) | def commit(self, tag):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py
  function _get_tag_from_path (line 11) | def _get_tag_from_path(path):
  class NebulaCheckpointEngine (line 15) | class NebulaCheckpointEngine(CheckpointEngine):
    method __init__ (line 16) | def __init__(self, config_params=None):
    method create (line 33) | def create(self, tag):
    method save (line 39) | def save(self, state_dict, path: str):
    method load (line 50) | def load(self, path: str, map_location=None):
    method commit (line 103) | def commit(self, tag):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py
  class TorchCheckpointEngine (line 7) | class TorchCheckpointEngine(CheckpointEngine):
    method __init__ (line 8) | def __init__(self, config_params=None):
    method create (line 11) | def create(self, tag):
    method save (line 14) | def save(self, state_dict, path: str):
    method load (line 20) | def load(self, path: str, map_location=None):
    method commit (line 26) | def commit(self, tag):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/comm/coalesced_collectives.py
  function _torch_reduce_scatter_fn (line 17) | def _torch_reduce_scatter_fn(input_tensor: Tensor,
  function reduce_scatter_coalesced (line 30) | def reduce_scatter_coalesced(

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/comm/mpi.py
  class MpiBackend (line 14) | class MpiBackend(object):
    method __init__ (line 15) | def __init__(self, cuda_aware):
    method my_igather (line 22) | def my_igather(self, rank, size, comm, sendbuf, recbuf, root):
    method gather_cuda (line 34) | def gather_cuda(self,
    method gather_host (line 64) | def gather_host(self,
    method allgather_cuda (line 125) | def allgather_cuda(self,
    method allgather_host (line 134) | def allgather_host(self,
    method compressed_allreduce (line 170) | def compressed_allreduce(self,

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/comm/nccl.py
  class NcclBackend (line 13) | class NcclBackend(object):
    method __init__ (line 14) | def __init__(self, mpu=None):
    method my_igather (line 29) | def my_igather(self, rank, size, group, sendbuf, recvbuf, root):
    method my_gather (line 41) | def my_gather(self, rank, size, group, sendbuf, recvbuf, root):
    method compressed_allreduce (line 51) | def compressed_allreduce(self,

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/compression/cupy.py
  class CupyBackend (line 10) | class CupyBackend(object):
    method __init__ (line 11) | def __init__(self):
    method torch2cupy (line 14) | def torch2cupy(self, tensor):
    method cupy2torch (line 17) | def cupy2torch(self, cupy_tensor):
    method compress_by_chunk (line 20) | def compress_by_chunk(self, cupy_bool_tensor, num_chunks):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/config.py
  class DeepSpeedConfigError (line 86) | class DeepSpeedConfigError(Exception):
  function get_curriculum_enabled (line 90) | def get_curriculum_enabled(param_dict):
  function get_curriculum_params (line 99) | def get_curriculum_params(param_dict):
  function get_pld_enabled (line 108) | def get_pld_enabled(param_dict):
  function get_pld_params (line 117) | def get_pld_params(param_dict):
  function get_amp_enabled (line 126) | def get_amp_enabled(param_dict):
  function get_amp_params (line 133) | def get_amp_params(param_dict):
  function get_fp16_enabled (line 142) | def get_fp16_enabled(param_dict):
  function get_bfloat16_enabled (line 149) | def get_bfloat16_enabled(param_dict):
  function get_fp16_master_weights_and_grads_enabled (line 158) | def get_fp16_master_weights_and_grads_enabled(param_dict):
  function get_fp16_auto_cast (line 167) | def get_fp16_auto_cast(param_dict):
  function get_loss_scale (line 172) | def get_loss_scale(param_dict):
  function get_initial_dynamic_scale (line 183) | def get_initial_dynamic_scale(param_dict):
  function get_dynamic_loss_scale_args (line 196) | def get_dynamic_loss_scale_args(param_dict):
  function get_gradient_accumulation_steps (line 229) | def get_gradient_accumulation_steps(param_dict):
  function get_sparse_gradients_enabled (line 235) | def get_sparse_gradients_enabled(param_dict):
  function get_communication_data_type (line 239) | def get_communication_data_type(param_dict):
  function get_prescale_gradients (line 258) | def get_prescale_gradients(param_dict):
  function get_gradient_predivide_factor (line 262) | def get_gradient_predivide_factor(param_dict):
  function get_steps_per_print (line 268) | def get_steps_per_print(param_dict):
  function get_disable_allgather (line 272) | def get_disable_allgather(param_dict):
  function get_dump_state (line 276) | def get_dump_state(param_dict):
  function get_gradient_clipping (line 280) | def get_gradient_clipping(param_dict):
  function get_sparse_attention (line 284) | def get_sparse_attention(param_dict):
  function get_sparse_dense_config (line 307) | def get_sparse_dense_config(sparsity):
  function get_sparse_fixed_config (line 312) | def get_sparse_fixed_config(sparsity):
  function get_sparse_variable_config (line 351) | def get_sparse_variable_config(sparsity):
  function get_sparse_bigbird_config (line 394) | def get_sparse_bigbird_config(sparsity):
  function get_sparse_bslongformer_config (line 423) | def get_sparse_bslongformer_config(sparsity):
  function get_sparse_attention_mode (line 454) | def get_sparse_attention_mode(param_dict):
  function get_sparse_attention_type (line 461) | def get_sparse_attention_type(param_dict):
  function get_pipeline_config (line 468) | def get_pipeline_config(param_dict):
  function get_optimizer_name (line 482) | def get_optimizer_name(param_dict):
  function get_optimizer_params (line 489) | def get_optimizer_params(param_dict):
  function get_optimizer_gradient_clipping (line 497) | def get_optimizer_gradient_clipping(param_dict):
  function get_optimizer_legacy_fusion (line 505) | def get_optimizer_legacy_fusion(param_dict):
  function get_zero_allow_untested_optimizer (line 512) | def get_zero_allow_untested_optimizer(param_dict):
  function get_scheduler_name (line 518) | def get_scheduler_name(param_dict):
  function get_scheduler_params (line 525) | def get_scheduler_params(param_dict):
  function get_train_batch_size (line 533) | def get_train_batch_size(param_dict):
  function get_train_micro_batch_size_per_gpu (line 537) | def get_train_micro_batch_size_per_gpu(param_dict):
  function get_wall_clock_breakdown (line 545) | def get_wall_clock_breakdown(param_dict):
  function get_memory_breakdown (line 551) | def get_memory_breakdown(param_dict):
  function get_eigenvalue_config (line 555) | def get_eigenvalue_config(param_dict):
  function get_eigenvalue_enabled (line 582) | def get_eigenvalue_enabled(param_dict):
  function get_eigenvalue_verbose (line 591) | def get_eigenvalue_verbose(param_dict):
  function get_eigenvalue_max_iter (line 600) | def get_eigenvalue_max_iter(param_dict):
  function get_eigenvalue_tol (line 609) | def get_eigenvalue_tol(param_dict):
  function get_eigenvalue_stability (line 618) | def get_eigenvalue_stability(param_dict):
  function get_eigenvalue_gas_boundary_resolution (line 627) | def get_eigenvalue_gas_boundary_resolution(param_dict):
  function get_eigenvalue_layer_name (line 638) | def get_eigenvalue_layer_name(param_dict):
  function get_eigenvalue_layer_num (line 647) | def get_eigenvalue_layer_num(param_dict):
  function get_checkpoint_params (line 656) | def get_checkpoint_params(param_dict):
  function get_data_types_params (line 660) | def get_data_types_params(param_dict):
  function get_checkpoint_tag_validation_mode (line 664) | def get_checkpoint_tag_validation_mode(checkpoint_params):
  function get_checkpoint_parallel_write_pipeline (line 677) | def get_checkpoint_parallel_write_pipeline(checkpoint_params):
  function get_dataloader_drop_last (line 690) | def get_dataloader_drop_last(param_dict):
  class DeepSpeedConfigWriter (line 700) | class DeepSpeedConfigWriter:
    method __init__ (line 701) | def __init__(self, data=None):
    method add_config (line 704) | def add_config(self, key, value):
    method load_config (line 707) | def load_config(self, filename):
    method write_config (line 712) | def write_config(self, filename):
  class DeepSpeedConfig (line 717) | class DeepSpeedConfig(object):
    method __init__ (line 718) | def __init__(self, config: Union[str, dict], mpu=None):
    method _initialize_params (line 824) | def _initialize_params(self, param_dict):
    method _batch_assertion (line 930) | def _batch_assertion(self):
    method _set_batch_related_parameters (line 954) | def _set_batch_related_parameters(self):
    method _configure_train_batch_size (line 999) | def _configure_train_batch_size(self):
    method _do_sanity_check (line 1003) | def _do_sanity_check(self):
    method print_user_config (line 1008) | def print_user_config(self):
    method print (line 1019) | def print(self, name):
    method _do_error_check (line 1028) | def _do_error_check(self):
    method _do_warning_check (line 1047) | def _do_warning_check(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/config_utils.py
  class DeepSpeedConfigModel (line 16) | class DeepSpeedConfigModel(BaseModel):
    method __init__ (line 50) | def __init__(self, strict=False, **data):
    method _process_deprecated_field (line 58) | def _process_deprecated_field(self, pydantic_config, field):
    method _deprecated_fields_check (line 100) | def _deprecated_fields_check(self, pydantic_config):
    class Config (line 106) | class Config:
  class pp_int (line 114) | class pp_int(int):
    method __new__ (line 120) | def __new__(cls, val, custom_print_str=None):
    method __repr__ (line 125) | def __repr__(self):
  class ScientificNotationEncoder (line 132) | class ScientificNotationEncoder(json.JSONEncoder):
    method iterencode (line 141) | def iterencode(self, o, _one_shot=False, level=0):
  class DeepSpeedConfigObject (line 164) | class DeepSpeedConfigObject(object):
    method repr (line 168) | def repr(self):
    method __repr__ (line 171) | def __repr__(self):
  function get_scalar_param (line 180) | def get_scalar_param(param_dict, param_name, param_default_value):
  function get_list_param (line 184) | def get_list_param(param_dict, param_name, param_default_value):
  function get_dict_param (line 188) | def get_dict_param(param_dict, param_name, param_default_value):
  function dict_raise_error_on_duplicate_keys (line 192) | def dict_raise_error_on_duplicate_keys(ordered_pairs):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/constants.py
  class ValidationMode (line 358) | class ValidationMode:

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/data_pipeline/curriculum_scheduler.py
  class CurriculumScheduler (line 8) | class CurriculumScheduler(object):
    method __init__ (line 9) | def __init__(self, config):
    method get_current_difficulty (line 87) | def get_current_difficulty(self):
    method set_current_difficulty (line 90) | def set_current_difficulty(self, difficulty):
    method get_state (line 93) | def get_state(self):
    method set_state (line 96) | def set_state(self, state):
    method __fixed_discrete_get_difficulty (line 99) | def __fixed_discrete_get_difficulty(self, global_steps):
    method __fixed_root_get_difficulty (line 107) | def __fixed_root_get_difficulty(self, global_steps, root_degree=None):
    method get_difficulty (line 121) | def get_difficulty(self, global_steps):
    method update_difficulty (line 131) | def update_difficulty(self, global_steps):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/dataloader.py
  class RepeatingLoader (line 10) | class RepeatingLoader:
    method __init__ (line 11) | def __init__(self, loader):
    method __iter__ (line 21) | def __iter__(self):
    method __next__ (line 24) | def __next__(self):
  class DeepSpeedDataLoader (line 33) | class DeepSpeedDataLoader(object):
    method __init__ (line 34) | def __init__(self,
    method __iter__ (line 80) | def __iter__(self):
    method __len__ (line 84) | def __len__(self):
    method __next__ (line 87) | def __next__(self):
    method _create_dataloader (line 92) | def _create_dataloader(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/eigenvalue.py
  class Eigenvalue (line 7) | class Eigenvalue(object):
    method __init__ (line 8) | def __init__(self,
    method nan_to_num (line 34) | def nan_to_num(self, x):
    method normalize (line 40) | def normalize(self, v):
    method inner_product (line 47) | def inner_product(self, xs, ys):
    method get_layers (line 50) | def get_layers(self, module):
    method compute_eigenvalue (line 61) | def compute_eigenvalue(self, module, device=None, scale=1.0):
    method post_process (line 150) | def post_process(self, value_list):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/engine.py
  function split_half_float_double_sparse (line 103) | def split_half_float_double_sparse(tensors):
  function print_configuration (line 123) | def print_configuration(args, name):
  class EngineTimers (line 142) | class EngineTimers(object):
    method __init__ (line 144) | def __init__(self, enable_micro_timers, enable_global_timers):
  class DeepSpeedEngine (line 182) | class DeepSpeedEngine(Module):
    method __init__ (line 184) | def __init__(
    method destroy (line 381) | def destroy(self):
    method _get_model_parameters (line 385) | def _get_model_parameters(self):
    method get_batch_info (line 409) | def get_batch_info(self):
    method set_train_batch_size (line 425) | def set_train_batch_size(self, train_batch_size):
    method get_global_grad_norm (line 446) | def get_global_grad_norm(self) -> float:
    method __getattr__ (line 458) | def __getattr__(self, name):
    method checkpoint_tag_validation_enabled (line 474) | def checkpoint_tag_validation_enabled(self):
    method checkpoint_tag_validation_fail (line 477) | def checkpoint_tag_validation_fail(self):
    method elasticity_enabled (line 480) | def elasticity_enabled(self):
    method is_elastic_model_parallel_supported (line 483) | def is_elastic_model_parallel_supported(self):
    method pld_enabled (line 491) | def pld_enabled(self):
    method pld_params (line 494) | def pld_params(self):
    method pld_theta (line 497) | def pld_theta(self):
    method pld_gamma (line 500) | def pld_gamma(self):
    method eigenvalue_enabled (line 503) | def eigenvalue_enabled(self):
    method eigenvalue_verbose (line 506) | def eigenvalue_verbose(self):
    method eigenvalue_max_iter (line 509) | def eigenvalue_max_iter(self):
    method eigenvalue_tol (line 512) | def eigenvalue_tol(self):
    method eigenvalue_stability (line 515) | def eigenvalue_stability(self):
    method eigenvalue_gas_boundary_resolution (line 518) | def eigenvalue_gas_boundary_resolution(self):
    method eigenvalue_layer_name (line 521) | def eigenvalue_layer_name(self):
    method eigenvalue_layer_num (line 524) | def eigenvalue_layer_num(self):
    method curriculum_enabled (line 527) | def curriculum_enabled(self):
    method curriculum_params (line 530) | def curriculum_params(self):
    method wall_clock_breakdown (line 533) | def wall_clock_breakdown(self):
    method flops_profiler_enabled (line 536) | def flops_profiler_enabled(self):
    method flops_profiler_profile_step (line 539) | def flops_profiler_profile_step(self):
    method flops_profiler_module_depth (line 545) | def flops_profiler_module_depth(self):
    method flops_profiler_top_modules (line 548) | def flops_profiler_top_modules(self):
    method flops_profiler_detailed (line 551) | def flops_profiler_detailed(self):
    method flops_profiler_output_file (line 556) | def flops_profiler_output_file(self):
    method memory_breakdown (line 559) | def memory_breakdown(self):
    method autotuning_enabled (line 562) | def autotuning_enabled(self):
    method autotuning_start_profile_step (line 565) | def autotuning_start_profile_step(self):
    method autotuning_end_profile_step (line 568) | def autotuning_end_profile_step(self):
    method autotuning_metric_path (line 571) | def autotuning_metric_path(self):
    method autotuning_model_info_path (line 577) | def autotuning_model_info_path(self):
    method autotuning_metric (line 583) | def autotuning_metric(self):
    method autotuning_profile_model_info (line 586) | def autotuning_profile_model_info(self):
    method sparse_gradients_enabled (line 592) | def sparse_gradients_enabled(self):
    method train_batch_size (line 595) | def train_batch_size(self):
    method train_micro_batch_size_per_gpu (line 598) | def train_micro_batch_size_per_gpu(self):
    method optimizer_name (line 601) | def optimizer_name(self):
    method optimizer_params (line 605) | def optimizer_params(self):
    method optimizer_legacy_fusion (line 608) | def optimizer_legacy_fusion(self):
    method scheduler_name (line 611) | def scheduler_name(self):
    method scheduler_params (line 614) | def scheduler_params(self):
    method quantize_training (line 617) | def quantize_training(self):
    method zero_optimization (line 639) | def zero_optimization(self):
    method zero_allow_untested_optimizer (line 642) | def zero_allow_untested_optimizer(self):
    method zero_reduce_scatter (line 645) | def zero_reduce_scatter(self):
    method zero_overlap_comm (line 648) | def zero_overlap_comm(self):
    method zero_offload_optimizer (line 651) | def zero_offload_optimizer(self):
    method zero_offload_param (line 654) | def zero_offload_param(self):
    method zero_use_cpu_optimizer (line 657) | def zero_use_cpu_optimizer(self):
    method zero_cpu_offload (line 665) | def zero_cpu_offload(self):
    method zero_sub_group_size (line 670) | def zero_sub_group_size(self):
    method zero_optimization_stage (line 673) | def zero_optimization_stage(self):
    method zero_reduce_bucket_size (line 676) | def zero_reduce_bucket_size(self):
    method zero_allgather_bucket_size (line 679) | def zero_allgather_bucket_size(self):
    method zero_optimization_partition_gradients (line 682) | def zero_optimization_partition_gradients(self):
    method zero_optimization_partition_weights (line 685) | def zero_optimization_partition_weights(self):
    method zero_contiguous_gradients (line 688) | def zero_contiguous_gradients(self):
    method zero_load_from_fp32_weights (line 691) | def zero_load_from_fp32_weights(self):
    method zero_elastic_checkpoint (line 694) | def zero_elastic_checkpoint(self):
    method zero_max_live_parameters (line 697) | def zero_max_live_parameters(self):
    method zero_max_reuse_distance (line 700) | def zero_max_reuse_distance(self):
    method zero_prefetch_bucket_size (line 703) | def zero_prefetch_bucket_size(self):
    method zero_param_persistence_threshold (line 706) | def zero_param_persistence_threshold(self):
    method zero_model_persistence_threshold (line 709) | def zero_model_persistence_threshold(self):
    method zero_gather_16bit_weights_on_model_save (line 712) | def zero_gather_16bit_weights_on_model_save(self):
    method zero_grad_hooks (line 715) | def zero_grad_hooks(self):
    method zero_legacy_stage1 (line 718) | def zero_legacy_stage1(self):
    method zero_ignore_unused_parameters (line 721) | def zero_ignore_unused_parameters(self):
    method fp16_enabled (line 724) | def fp16_enabled(self):
    method bfloat16_enabled (line 727) | def bfloat16_enabled(self):
    method fp16_master_weights_and_gradients (line 730) | def fp16_master_weights_and_gradients(self):
    method amp_enabled (line 733) | def amp_enabled(self):
    method amp_params (line 736) | def amp_params(self):
    method fp16_auto_cast (line 739) | def fp16_auto_cast(self):
    method loss_scale (line 742) | def loss_scale(self):
    method gradient_accumulation_steps (line 745) | def gradient_accumulation_steps(self):
    method use_node_local_storage (line 748) | def use_node_local_storage(self):
    method load_universal_checkpoint (line 751) | def load_universal_checkpoint(self):
    method communication_data_type (line 755) | def communication_data_type(self):
    method postscale_gradients (line 766) | def postscale_gradients(self):
    method gradient_predivide_factor (line 769) | def gradient_predivide_factor(self):
    method steps_per_print (line 772) | def steps_per_print(self):
    method zero_allgather_partitions (line 775) | def zero_allgather_partitions(self):
    method zero_round_robin_gradients (line 778) | def zero_round_robin_gradients(self):
    method dump_state (line 781) | def dump_state(self):
    method gradient_clipping (line 784) | def gradient_clipping(self):
    method dynamic_loss_scale (line 787) | def dynamic_loss_scale(self):
    method initial_dynamic_scale (line 790) | def initial_dynamic_scale(self):
    method dynamic_loss_scale_args (line 793) | def dynamic_loss_scale_args(self):
    method swap_tensor_config (line 796) | def swap_tensor_config(self):
    method aio_config (line 799) | def aio_config(self):
    method get_data_types (line 802) | def get_data_types(self):
    method _configure_lr_scheduler (line 819) | def _configure_lr_scheduler(self, client_lr_scheduler):
    method _configure_checkpointing (line 838) | def _configure_checkpointing(self, dist_init_required):
    method _scheduler_from_config (line 872) | def _scheduler_from_config(self, optimizer):
    method _set_distributed_vars (line 890) | def _set_distributed_vars(self, args):
    method _configure_with_arguments (line 905) | def _configure_with_arguments(self, args, mpu):
    method _do_args_sanity_check (line 929) | def _do_args_sanity_check(self, args):
    method _is_supported_optimizer (line 959) | def _is_supported_optimizer(self, optimizer_name):
    method _supported_optims (line 965) | def _supported_optims(self):
    method _do_sanity_check (line 979) | def _do_sanity_check(self):
    method _broadcast_model (line 1006) | def _broadcast_model(self):
    method __check_params (line 1026) | def __check_params(model: Module, dtype: torch.dtype) -> None:
    method _set_client_model (line 1036) | def _set_client_model(self, model):
    method _configure_distributed_model (line 1043) | def _configure_distributed_model(self, model):
    method _check_for_duplicates (line 1109) | def _check_for_duplicates(self, optimizer):
    method _do_optimizer_sanity_check (line 1123) | def _do_optimizer_sanity_check(self, basic_optimizer):
    method _configure_optimizer (line 1179) | def _configure_optimizer(self, client_optimizer, model_parameters):
    method _configure_basic_optimizer (line 1233) | def _configure_basic_optimizer(self, model_parameters):
    method _configure_compression_scheduler (line 1313) | def _configure_compression_scheduler(self):
    method _configure_quantization (line 1316) | def _configure_quantization(self):
    method _configure_fp16_optimizer (line 1347) | def _configure_fp16_optimizer(self, optimizer):
    method _configure_bf16_optimizer (line 1403) | def _configure_bf16_optimizer(self, optimizer):
    method _configure_zero_optimizer (line 1423) | def _configure_zero_optimizer(self, optimizer):
    method _configure_eigenvalue (line 1541) | def _configure_eigenvalue(self):
    method _configure_progressive_layer_drop (line 1554) | def _configure_progressive_layer_drop(self):
    method _configure_curriculum_scheduler (line 1559) | def _configure_curriculum_scheduler(self):
    method is_map_style_dataset (line 1564) | def is_map_style_dataset(obj):
    method is_iterable_style_dataset (line 1568) | def is_iterable_style_dataset(obj):
    method dataloader_drop_last (line 1573) | def dataloader_drop_last(self):
    method was_step_applied (line 1576) | def was_step_applied(self) -> bool:
    method deepspeed_io (line 1586) | def deepspeed_io(self,
    method train (line 1631) | def train(self, mode=True):
    method eval (line 1637) | def eval(self):
    method _scale_loss_by_gas (line 1643) | def _scale_loss_by_gas(self, prescaled_loss):
    method forward (line 1664) | def forward(self, *inputs, **kwargs):
    method _cast_inputs_half (line 1750) | def _cast_inputs_half(self, inputs):
    method print_forward_breakdown (line 1766) | def print_forward_breakdown(self, fwd_time):
    method allreduce_gradients (line 1790) | def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE):
    method backward (line 1811) | def backward(self,
    method is_gradient_accumulation_boundary (line 1902) | def is_gradient_accumulation_boundary(self):
    method set_gradient_accumulation_boundary (line 1915) | def set_gradient_accumulation_boundary(self, is_boundary):
    method zero_grad (line 1938) | def zero_grad(self):
    method clip_fp32_gradients (line 1945) | def clip_fp32_gradients(self):
    method _take_model_step (line 1950) | def _take_model_step(self, lr_kwargs, block_eigenvalue={}):
    method step (line 2018) | def step(self, lr_kwargs=None):
    method _start_timers (line 2132) | def _start_timers(self, timer_names):
    method _stop_timers (line 2136) | def _stop_timers(self, timer_names):
    method _autotuning_exit (line 2143) | def _autotuning_exit(self):
    method _write_monitor (line 2166) | def _write_monitor(self):
    method _get_optimizer_param (line 2197) | def _get_optimizer_param(self, param_name):
    method get_lr (line 2208) | def get_lr(self):
    method get_type (line 2211) | def get_type(self):
    method get_mom (line 2214) | def get_mom(self):
    method get_pld_theta (line 2220) | def get_pld_theta(self):
    method _report_progress (line 2226) | def _report_progress(self, step):
    method allreduce_bucket (line 2232) | def allreduce_bucket(self, bucket, dp_group):
    method allreduce_and_copy (line 2259) | def allreduce_and_copy(self, small_bucket, dp_group):
    method allreduce_no_retain (line 2264) | def allreduce_no_retain(self, bucket, dp_group, numel_per_bucket=50000...
    method _get_gradients_for_reduction (line 2277) | def _get_gradients_for_reduction(self):
    method _reduce_non_expert_gradients (line 2307) | def _reduce_non_expert_gradients(self, grads, elements_per_buffer):
    method _reduce_expert_gradients (line 2324) | def _reduce_expert_gradients(self, expert_grads, elements_per_buffer):
    method buffered_allreduce_fallback (line 2340) | def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=...
    method sparse_allreduce_no_retain (line 2352) | def sparse_allreduce_no_retain(self, bucket, dp_group):
    method sparse_allreduce_bucket (line 2361) | def sparse_allreduce_bucket(self, bucket, dp_group):
    method sparse_allreduce (line 2367) | def sparse_allreduce(self, sparse, dp_group):
    method sparse_all_gather (line 2393) | def sparse_all_gather(self, value, dp_group):
    method all_gather_scalar (line 2428) | def all_gather_scalar(self, value, dp_group):
    method module_state_dict (line 2436) | def module_state_dict(self, destination=None, prefix="", keep_vars=Fal...
    method load_moe_state_dict (line 2441) | def load_moe_state_dict(checkpoint_path,
    method load_module_state_dict (line 2503) | def load_module_state_dict(self, state_dict, strict=True, custom_load_...
    method _get_zero_ckpt_prefix (line 2510) | def _get_zero_ckpt_prefix(self, dp_rank, bf16_mode):
    method _get_rank_zero_ckpt_name (line 2513) | def _get_rank_zero_ckpt_name(self,
    method _get_zero_ckpt_name (line 2527) | def _get_zero_ckpt_name(self, checkpoints_path, tag):
    method _get_ckpt_name (line 2537) | def _get_ckpt_name(self, checkpoints_path, tag, mp_placeholder=None):
    method _get_optimizer_ckpt_name (line 2560) | def _get_optimizer_ckpt_name(self, checkpoints_path, tag, expp_rank):
    method _get_expert_ckpt_name (line 2569) | def _get_expert_ckpt_name(checkpoints_path, layer_id, expert_id, tag, ...
    method _get_all_ckpt_names (line 2586) | def _get_all_ckpt_names(self, checkpoints_path, tag):
    method load_checkpoint (line 2597) | def load_checkpoint(self,
    method _load_checkpoint (line 2669) | def _load_checkpoint(self,
    method _load_zero_checkpoint (line 2813) | def _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states=T...
    method _get_mp_rank_zero_checkpoint_names (line 2845) | def _get_mp_rank_zero_checkpoint_names(self,
    method _get_all_zero_checkpoint_names (line 2862) | def _get_all_zero_checkpoint_names(self, load_dir, tag, bf16_mode):
    method _get_all_zero_checkpoint_state_dicts (line 2882) | def _get_all_zero_checkpoint_state_dicts(self, zero_ckpt_names):
    method _get_all_zero_checkpoints (line 2905) | def _get_all_zero_checkpoints(self, load_dir, tag):
    method _checkpoint_tag_validation (line 2923) | def _checkpoint_tag_validation(self, tag):
    method save_checkpoint (line 2941) | def save_checkpoint(self, save_dir, tag=None, client_state={}, save_la...
    method _get_non_moe_state_dict (line 3007) | def _get_non_moe_state_dict(self, full_state_dict):
    method _save_moe_checkpoint (line 3017) | def _save_moe_checkpoint(self, save_dir, tag, client_state={}):
    method _create_checkpoint_file (line 3127) | def _create_checkpoint_file(self, save_dir, tag, zero_checkpoint):
    method _create_zero_checkpoint_files (line 3139) | def _create_zero_checkpoint_files(self, save_dir, tag):
    method _save_checkpoint (line 3150) | def _save_checkpoint(self, save_dir, tag, client_state={}):
    method _get_buffer_names (line 3186) | def _get_buffer_names(self):
    method _get_zero_param_shapes (line 3207) | def _get_zero_param_shapes(self):
    method _copy_recovery_script (line 3249) | def _copy_recovery_script(self, save_path):
    method _save_zero_checkpoint (line 3259) | def _save_zero_checkpoint(self, save_path, tag):
    method _zero3_consolidated_16bit_state_dict (line 3271) | def _zero3_consolidated_16bit_state_dict(self):
    method save_fp16_model (line 3336) | def save_fp16_model(self, save_dir, save_filename="pytorch_model.bin"):
    method save_16bit_model (line 3341) | def save_16bit_model(self, save_dir, save_filename="pytorch_model.bin"):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/fp16/fused_optimizer.py
  class FP16_Optimizer (line 19) | class FP16_Optimizer(DeepSpeedOptimizer):
    method __init__ (line 25) | def __init__(self,
    method initialize_optimizer_states (line 120) | def initialize_optimizer_states(self):
    method zero_grad (line 133) | def zero_grad(self, set_grads_to_None=True):
    method step_fused_adam (line 147) | def step_fused_adam(self, closure=None):
    method start_timers (line 199) | def start_timers(self, name_list):
    method stop_timers (line 204) | def stop_timers(self, name_list):
    method log_timers (line 209) | def log_timers(self, name_list):
    method set_lr (line 213) | def set_lr(self, lr):
    method get_lr (line 218) | def get_lr(self):
    method override_loss_scale (line 222) | def override_loss_scale(self, loss_scale):
    method step (line 230) | def step(self, closure=None):
    method _get_norm_with_moe_layers (line 328) | def _get_norm_with_moe_layers(self, all_groups_norm):
    method unscale_and_clip_grads (line 344) | def unscale_and_clip_grads(self, grad_groups_flat, total_norm, apply_s...
    method backward (line 359) | def backward(self, loss, create_graph=False, retain_graph=False):
    method _update_scale (line 374) | def _update_scale(self, skip):
    method _get_state (line 405) | def _get_state(self):
    method _set_state (line 408) | def _set_state(self, value):
    method _get_param_groups (line 415) | def _get_param_groups(self):
    method _set_param_groups (line 418) | def _set_param_groups(self, value):
    method state_dict (line 423) | def state_dict(self):
    method refresh_fp32_params (line 448) | def refresh_fp32_params(self):
    method load_state_dict (line 452) | def load_state_dict(self, state_dict, load_optimizer_states=True):
    method __repr__ (line 496) | def __repr__(self):
    method _get_loss_scale (line 500) | def _get_loss_scale(self):
    method _set_loss_scale (line 506) | def _set_loss_scale(self, value):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/fp16/loss_scaler.py
  function to_python_float (line 26) | def to_python_float(t):
  class LossScalerBase (line 32) | class LossScalerBase:
    method __init__ (line 36) | def __init__(self, cur_scale):
    method loss_scale (line 40) | def loss_scale(self):
    method scale_gradient (line 43) | def scale_gradient(self, module, grad_in, grad_out):
    method update_scale (line 46) | def update_scale(self, overflow):
    method backward (line 49) | def backward(self, loss, retain_graph=False):
  class LossScaler (line 54) | class LossScaler(LossScalerBase):
    method __init__ (line 65) | def __init__(self, scale=1):
    method has_overflow (line 69) | def has_overflow(self, params):
    method _has_inf_or_nan (line 73) | def _has_inf_or_nan(x):
  class DynamicLossScaler (line 77) | class DynamicLossScaler(LossScalerBase):
    method __init__ (line 102) | def __init__(self,
    method has_overflow_serial (line 122) | def has_overflow_serial(self, params):
    method _has_inf_or_nan (line 130) | def _has_inf_or_nan(x):
    method update_scale (line 151) | def update_scale(self, overflow):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/fp16/onebit/adam.py
  class OnebitAdam (line 10) | class OnebitAdam(torch.optim.Optimizer):
    method __init__ (line 41) | def __init__(self,
    method step (line 106) | def step(self, closure=None, grads=None):
    method load_state_dict (line 262) | def load_state_dict(self, state_dict):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/fp16/onebit/lamb.py
  class OnebitLamb (line 11) | class OnebitLamb(torch.optim.Optimizer):
    method __init__ (line 56) | def __init__(self,
    method step (line 137) | def step(self, closure=None, grads=None):
    method load_state_dict (line 406) | def load_state_dict(self, state_dict):
    method get_lamb_coeffs (line 468) | def get_lamb_coeffs(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/fp16/onebit/zoadam.py
  class ZeroOneAdam (line 10) | class ZeroOneAdam(torch.optim.Optimizer):
    method __init__ (line 51) | def __init__(self,
    method step (line 119) | def step(self, closure=None, grads=None):
    method load_state_dict (line 333) | def load_state_dict(self, state_dict):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/fp16/unfused_optimizer.py
  class FP16_UnfusedOptimizer (line 20) | class FP16_UnfusedOptimizer(DeepSpeedOptimizer):
    method __init__ (line 26) | def __init__(self,
    method zero_grad (line 113) | def zero_grad(self, set_grads_to_None=True):
    method step_fused_lamb (line 128) | def step_fused_lamb(self, closure=None):
    method set_lr (line 190) | def set_lr(self, lr):
    method get_lr (line 195) | def get_lr(self):
    method override_loss_scale (line 199) | def override_loss_scale(self, loss_scale):
    method step (line 207) | def step(self, closure=None):
    method unscale_and_clip_grads (line 260) | def unscale_and_clip_grads(self, total_norm, apply_scale=True):
    method backward (line 277) | def backward(self, loss, create_graph=False, retain_graph=False):
    method _update_scale (line 292) | def _update_scale(self, skip):
    method _get_state (line 323) | def _get_state(self):
    method _set_state (line 326) | def _set_state(self, value):
    method _get_param_groups (line 333) | def _get_param_groups(self):
    method _set_param_groups (line 336) | def _set_param_groups(self, value):
    method _get_loss_scale (line 342) | def _get_loss_scale(self):
    method _set_loss_scale (line 348) | def _set_loss_scale(self, value):
    method state_dict (line 353) | def state_dict(self):
    method refresh_fp32_params (line 377) | def refresh_fp32_params(self):
    method load_state_dict (line 382) | def load_state_dict(self, state_dict, load_optimizer_states=True):
    method __repr__ (line 427) | def __repr__(self):
    method initialize_optimizer_states (line 430) | def initialize_optimizer_states(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/lr_schedules.py
  function add_tuning_arguments (line 55) | def add_tuning_arguments(parser):
  function parse_arguments (line 159) | def parse_arguments():
  function override_lr_range_test_params (line 167) | def override_lr_range_test_params(args, params):
  function override_1cycle_params (line 184) | def override_1cycle_params(args, params):
  function override_warmupLR_params (line 223) | def override_warmupLR_params(args, params):
  function override_params (line 237) | def override_params(args, params):
  function get_config_from_args (line 248) | def get_config_from_args(args):
  function get_lr_from_config (line 269) | def get_lr_from_config(config):
  function get_torch_optimizer (line 297) | def get_torch_optimizer(optimizer):
  class LRRangeTest (line 308) | class LRRangeTest(object):
    method __init__ (line 346) | def __init__(self,
    method _staircase_interval (line 376) | def _staircase_interval(self):
    method _continuous_interval (line 379) | def _continuous_interval(self):
    method _get_increase (line 382) | def _get_increase(self):
    method get_lr (line 385) | def get_lr(self):
    method get_last_lr (line 391) | def get_last_lr(self):
    method _update_optimizer (line 397) | def _update_optimizer(self, group_lrs):
    method step (line 401) | def step(self, batch_iteration=None):
    method state_dict (line 408) | def state_dict(self):
    method load_state_dict (line 411) | def load_state_dict(self, sd):
  class OneCycle (line 415) | class OneCycle(object):
    method __init__ (line 483) | def __init__(self,
    method _initialize_cycle (line 529) | def _initialize_cycle(self,
    method _initialize_lr (line 554) | def _initialize_lr(self,
    method _initialize_momentum (line 572) | def _initialize_momentum(self,
    method _get_scale_factor (line 597) | def _get_scale_factor(self):
    method _get_cycle_mom (line 608) | def _get_cycle_mom(self):
    method _get_cycle_lr (line 619) | def _get_cycle_lr(self):
    method _get_decay_mom (line 629) | def _get_decay_mom(self, decay_batch_iteration):
    method _get_decay_lr (line 639) | def _get_decay_lr(self, decay_batch_iteration):
    method get_lr (line 653) | def get_lr(self):
    method get_mom (line 661) | def get_mom(self):
    method get_last_lr (line 672) | def get_last_lr(self):
    method step (line 678) | def step(self, batch_iteration=None):
    method state_dict (line 697) | def state_dict(self):
    method load_state_dict (line 700) | def load_state_dict(self, sd):
  class WarmupLR (line 704) | class WarmupLR(object):
    method __init__ (line 725) | def __init__(self,
    method get_lr (line 749) | def get_lr(self):
    method get_last_lr (line 761) | def get_last_lr(self):
    method step (line 767) | def step(self, last_batch_iteration=None):
    method state_dict (line 775) | def state_dict(self):
    method load_state_dict (line 778) | def load_state_dict(self, sd):
    method _get_gamma (line 781) | def _get_gamma(self):
    method _format_param (line 789) | def _format_param(self, optimizer, param_value, param_name):
  class WarmupDecayLR (line 800) | class WarmupDecayLR(WarmupLR):
    method __init__ (line 822) | def __init__(self,
    method _get_gamma (line 844) | def _get_gamma(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/pipe/engine.py
  function is_even (line 24) | def is_even(number):
  function _tensor_bytes (line 32) | def _tensor_bytes(tensor):
  class PipelineEngine (line 36) | class PipelineEngine(DeepSpeedEngine):
    method __init__ (line 58) | def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs):
    method set_has_attention_mask (line 219) | def set_has_attention_mask(self, value):
    method _build_data_iter (line 223) | def _build_data_iter(self, dataset):
    method _exec_reduce_tied_grads (line 234) | def _exec_reduce_tied_grads(self):
    method _exec_reduce_grads (line 251) | def _exec_reduce_grads(self):
    method _bf16_reduce_grads (line 264) | def _bf16_reduce_grads(self):
    method _reserve_pipe_buffers (line 270) | def _reserve_pipe_buffers(self, num_buffers):
    method reset_activation_shape (line 286) | def reset_activation_shape(self):
    method train_batch (line 296) | def train_batch(self, data_iter=None):
    method eval_batch (line 381) | def eval_batch(self,
    method set_train_batch_size (line 469) | def set_train_batch_size(self, train_batch_size):
    method is_first_stage (line 482) | def is_first_stage(self):
    method is_last_stage (line 486) | def is_last_stage(self):
    method _reduce_outputs (line 490) | def _reduce_outputs(self, outputs, reduce='avg', reduce_dp=True):
    method _bcast_pipe_scalar (line 522) | def _bcast_pipe_scalar(self, data, src_rank=None, dtype=torch.float32):
    method _aggregate_total_loss (line 539) | def _aggregate_total_loss(self):
    method set_dataloader (line 571) | def set_dataloader(self, loader):
    method set_dataiterator (line 577) | def set_dataiterator(self, iterator):
    method set_batch_fn (line 583) | def set_batch_fn(self, fn):
    method is_gradient_accumulation_boundary (line 591) | def is_gradient_accumulation_boundary(self):
    method log_for_device (line 602) | def log_for_device(self, *msg):
    method tput_log (line 615) | def tput_log(self, *msg):
    method _next_batch (line 619) | def _next_batch(self):
    method _exec_forward_pass (line 631) | def _exec_forward_pass(self, buffer_id):
    method _exec_backward_pass (line 711) | def _exec_backward_pass(self, buffer_id):
    method _exec_load_micro_batch (line 787) | def _exec_load_micro_batch(self, buffer_id):
    method _send_tensor_meta (line 828) | def _send_tensor_meta(self, buffer, recv_stage):
    method _recv_tensor_meta (line 893) | def _recv_tensor_meta(self, send_stage):
    method _exec_send_activations (line 947) | def _exec_send_activations(self, buffer_id):
    method _exec_send_grads (line 983) | def _exec_send_grads(self, buffer_id):
    method _exec_recv_activations (line 1042) | def _exec_recv_activations(self, buffer_id):
    method _exec_recv_grads (line 1087) | def _exec_recv_grads(self, buffer_id):
    method _exec_optimizer_step (line 1154) | def _exec_optimizer_step(self, lr_kwargs=None):
    method _zero_grads (line 1198) | def _zero_grads(self, inputs):
    method _allocate_zeros (line 1207) | def _allocate_zeros(self, shape, **kwargs):
    method _allocate_buffer (line 1225) | def _allocate_buffer(self, shape, num_buffers=-1, **kwargs):
    method _allocate_buffers (line 1233) | def _allocate_buffers(self, shapes_and_dtypes, requires_grad=False, nu...
    method forward (line 1247) | def forward(self, *args, **kwargs):
    method backward (line 1251) | def backward(self, *args, **kwargs):
    method step (line 1255) | def step(self, *args, **kwargs):
    method mem_status (line 1259) | def mem_status(self, msg, print_rank=-1, reset_max=False):
    method module_state_dict (line 1308) | def module_state_dict(self):
    method load_module_state_dict (line 1326) | def load_module_state_dict(self, state_dict, strict=True, custom_load_...
    method _exec_schedule (line 1361) | def _exec_schedule(self, pipe_schedule):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/pipe/module.py
  class PipelineError (line 19) | class PipelineError(Exception):
  class LayerSpec (line 23) | class LayerSpec:
    method __init__ (line 45) | def __init__(self, typename, *module_args, **module_kwargs):
    method __repr__ (line 58) | def __repr__(self):
    method build (line 63) | def build(self, log=False):
  class TiedLayerSpec (line 71) | class TiedLayerSpec(LayerSpec):
    method __init__ (line 72) | def __init__(self,
  class PipelineModule (line 85) | class PipelineModule(nn.Module):
    method __init__ (line 120) | def __init__(self,
    method _build (line 211) | def _build(self):
    method _count_layer_params (line 267) | def _count_layer_params(self):
    method _find_layer_type (line 286) | def _find_layer_type(self, layername):
    method forward (line 308) | def forward(self, forward_input):
    method _partition_layers (line 364) | def _partition_layers(self, method='uniform'):
    method allreduce_tied_weight_gradients (line 420) | def allreduce_tied_weight_gradients(self):
    method get_tied_weights_and_groups (line 426) | def get_tied_weights_and_groups(self):
    method _synchronize_tied_weights (line 433) | def _synchronize_tied_weights(self):
    method _index_tied_modules (line 442) | def _index_tied_modules(self):
    method partitions (line 498) | def partitions(self):
    method stage_owner (line 501) | def stage_owner(self, layer_idx):
    method _set_bounds (line 508) | def _set_bounds(self, start=None, stop=None):
    method set_checkpoint_interval (line 518) | def set_checkpoint_interval(self, interval):
    method topology (line 522) | def topology(self):
    method mpu (line 526) | def mpu(self):
    method num_pipeline_stages (line 529) | def num_pipeline_stages(self):
    method ckpt_prefix (line 532) | def ckpt_prefix(self, checkpoints_path, tag):
    method ckpt_layer_path (line 548) | def ckpt_layer_path(self, ckpt_dir, local_layer_idx):
    method ckpt_layer_path_list (line 558) | def ckpt_layer_path_list(self, ckpt_dir, local_layer_idx):
    method save_state_dict (line 567) | def save_state_dict(self, save_dir, checkpoint_engine):
    method load_state_dir (line 605) | def load_state_dir(self, load_dir, checkpoint_engine, strict=True):
    method _is_checkpointable (line 631) | def _is_checkpointable(self, funcs):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/pipe/p2p.py
  function can_send_recv (line 21) | def can_send_recv() -> bool:
  function init_process_groups (line 29) | def init_process_groups(grid):
  function _is_valid_send_recv (line 39) | def _is_valid_send_recv(src_stage, dest_stage):
  function send (line 48) | def send(tensor, dest_stage, async_op=False):
  function recv (line 69) | def recv(tensor, src_stage, async_op=False):
  function wait (line 89) | def wait():
  function send_obj (line 98) | def send_obj(msg: typing.Any, dest: int):
  function recv_obj (line 121) | def recv_obj(sender: int) -> typing.Any:
  function _get_send_recv_group (line 163) | def _get_send_recv_group(src_stage, dest_stage):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/pipe/schedule.py
  class PipeSchedule (line 6) | class PipeSchedule(ABC):
    method __init__ (line 43) | def __init__(self, micro_batches, stages, stage_id):
    method steps (line 52) | def steps(self):
    method num_pipe_buffers (line 63) | def num_pipe_buffers(self):
    method _valid_micro_batch (line 74) | def _valid_micro_batch(self, micro_batch_id):
    method _valid_stage (line 77) | def _valid_stage(self, stage_id):
    method stage (line 81) | def stage(self):
    method num_stages (line 86) | def num_stages(self):
    method num_micro_batches (line 91) | def num_micro_batches(self):
    method is_first_stage (line 96) | def is_first_stage(self):
    method is_last_stage (line 101) | def is_last_stage(self):
    method _buffer_idx (line 105) | def _buffer_idx(self, micro_batch_id):
    method __iter__ (line 119) | def __iter__(self):
    method __next__ (line 123) | def __next__(self):
  class InferenceSchedule (line 129) | class InferenceSchedule(PipeSchedule):
    method steps (line 132) | def steps(self):
    method num_pipe_buffers (line 173) | def num_pipe_buffers(self):
  class TrainSchedule (line 182) | class TrainSchedule(PipeSchedule):
    method steps (line 189) | def steps(self):
    method num_pipe_buffers (line 243) | def num_pipe_buffers(self):
    method _step_to_micro_batch (line 249) | def _step_to_micro_batch(self, step_id):
    method _even_step_forward_id (line 271) | def _even_step_forward_id(self, step_id):
    method _odd_step_forward_id (line 276) | def _odd_step_forward_id(self, step_id):
    method _even_step_backward_id (line 281) | def _even_step_backward_id(self, step_id):
    method _odd_step_backward_id (line 286) | def _odd_step_backward_id(self, step_id):
  class DataParallelSchedule (line 292) | class DataParallelSchedule(PipeSchedule):
    method steps (line 296) | def steps(self):
    method num_pipe_buffers (line 311) | def num_pipe_buffers(self):
  class PipeInstruction (line 317) | class PipeInstruction:
    method __init__ (line 326) | def __init__(self, **kwargs):
    method __repr__ (line 332) | def __repr__(self):
  class OptimizerStep (line 336) | class OptimizerStep(PipeInstruction):
  class ReduceGrads (line 346) | class ReduceGrads(PipeInstruction):
  class ReduceTiedGrads (line 352) | class ReduceTiedGrads(PipeInstruction):
  class BufferOpInstruction (line 364) | class BufferOpInstruction(PipeInstruction):
    method __init__ (line 370) | def __init__(self, buffer_id, **kwargs):
  class LoadMicroBatch (line 375) | class LoadMicroBatch(BufferOpInstruction):
  class ForwardPass (line 388) | class ForwardPass(BufferOpInstruction):
  class BackwardPass (line 400) | class BackwardPass(BufferOpInstruction):
  class SendActivation (line 416) | class SendActivation(BufferOpInstruction):
  class RecvActivation (line 432) | class RecvActivation(BufferOpInstruction):
  class SendGrad (line 448) | class SendGrad(BufferOpInstruction):
  class RecvGrad (line 463) | class RecvGrad(BufferOpInstruction):
  function _is_even (line 477) | def _is_even(x):
  function _is_odd (line 481) | def _is_odd(x):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/pipe/topology.py
  class ProcessTopology (line 9) | class ProcessTopology:
    method __init__ (line 22) | def __init__(self, axes, dims):
    method get_rank (line 45) | def get_rank(self, **coord_kwargs):
    method get_axis_names (line 61) | def get_axis_names(self):
    method get_rank_repr (line 65) | def get_rank_repr(self,
    method get_dim (line 99) | def get_dim(self, axis):
    method get_coord (line 111) | def get_coord(self, rank):
    method get_axis_comm_lists (line 128) | def get_axis_comm_lists(self, axis):
    method filter_match (line 168) | def filter_match(self, **filter_kwargs):
    method get_axis_list (line 193) | def get_axis_list(self, axis, idx):
    method world_size (line 210) | def world_size(self):
    method __str__ (line 213) | def __str__(self):
  function _prime_factors (line 217) | def _prime_factors(N):
  class PipeDataParallelTopology (line 232) | class PipeDataParallelTopology(ProcessTopology):
    method __init__ (line 239) | def __init__(self, num_pp, num_dp):
  class PipeModelDataParallelTopology (line 243) | class PipeModelDataParallelTopology(ProcessTopology):
    method __init__ (line 245) | def __init__(self, num_pp, num_mp, num_dp):
  class PipelineParallelGrid (line 249) | class PipelineParallelGrid:
    method __init__ (line 271) | def __init__(self, topology=None, process_group=None):
    method get_stage_id (line 364) | def get_stage_id(self):
    method get_data_parallel_id (line 367) | def get_data_parallel_id(self):
    method _build_p2p_groups (line 370) | def _build_p2p_groups(self):
    method _is_grid_valid (line 387) | def _is_grid_valid(self):
    method stage_to_global (line 395) | def stage_to_global(self, stage_id, **kwargs):
    method topology (line 400) | def topology(self):
    method get_global_rank (line 404) | def get_global_rank(self):
    method get_pipe_parallel_rank (line 407) | def get_pipe_parallel_rank(self):
    method get_pipe_parallel_world_size (line 411) | def get_pipe_parallel_world_size(self):
    method get_pipe_parallel_group (line 415) | def get_pipe_parallel_group(self):
    method get_data_parallel_rank (line 419) | def get_data_parallel_rank(self):
    method get_data_parallel_world_size (line 423) | def get_data_parallel_world_size(self):
    method get_data_parallel_group (line 427) | def get_data_parallel_group(self):
    method get_model_parallel_rank (line 433) | def get_model_parallel_rank(self):
    method get_model_parallel_world_size (line 436) | def get_model_parallel_world_size(self):
    method get_model_parallel_group (line 439) | def get_model_parallel_group(self):
    method get_slice_parallel_rank (line 443) | def get_slice_parallel_rank(self):
    method get_slice_parallel_world_size (line 449) | def get_slice_parallel_world_size(self):
    method get_slice_parallel_group (line 452) | def get_slice_parallel_group(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/progressive_layer_drop.py
  class ProgressiveLayerDrop (line 5) | class ProgressiveLayerDrop(object):
    method __init__ (line 14) | def __init__(self, theta=0.5, gamma=0.001):
    method get_state (line 22) | def get_state(self):
    method get_theta (line 26) | def get_theta(self):
    method update_state (line 29) | def update_state(self, global_step):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/quantize.py
  class Quantizer (line 9) | class Quantizer(object):
    method __init__ (line 10) | def __init__(self,
    method any_precision_switch (line 33) | def any_precision_switch(self):
    method quantize (line 46) | def quantize(self,
    method step (line 73) | def step(self):
    method quantize_highbit (line 76) | def quantize_highbit(self, inputs, num_bits):
    method quantize_tenary (line 104) | def quantize_tenary(self, inputs):
    method quantize_binary (line 117) | def quantize_binary(self, inputs):
    method mixed_fp16_quantize (line 125) | def mixed_fp16_quantize(self, input, input_q, index):
    method compute_quantization (line 132) | def compute_quantization(self, input, index=0, factor=1):
    method update_fp16_ratio (line 181) | def update_fp16_ratio(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/sparse_tensor.py
  class SparseTensor (line 11) | class SparseTensor(object):
    method __init__ (line 13) | def __init__(self, dense_tensor=None):
    method to_coo_tensor (line 31) | def to_coo_tensor(self):
    method type (line 37) | def type():
    method to_dense (line 40) | def to_dense(self):
    method sparse_size (line 48) | def sparse_size(self):
    method add (line 56) | def add(self, b):
    method __str__ (line 61) | def __str__(self):
    method __repr__ (line 69) | def __repr__(self):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/state_dict_factory.py
  class SDLoaderFactory (line 20) | class SDLoaderFactory:
    method get_sd_loader_json (line 22) | def get_sd_loader_json(json_file, checkpoint_engine):
    method get_sd_loader (line 42) | def get_sd_loader(ckpt_list, checkpoint_engine, sd_type='Megatron', ve...
  class SDLoaderBase (line 49) | class SDLoaderBase(ABC):
    method __init__ (line 50) | def __init__(self, ckpt_list, version, checkpoint_engine):
    method load (line 58) | def load(self,
    method get_merge_state_dicts (line 116) | def get_merge_state_dicts(self, mp_world_size, mp_rank):
    method get_split_state_dict (line 134) | def get_split_state_dict(self, mp_world_size, mp_rank):
    method _choose_module_key (line 152) | def _choose_module_key(self, sd):
    method get_module (line 160) | def get_module(self, sd):
    method set_module (line 168) | def set_module(self, sd, module):
    method check_ckpt_list (line 177) | def check_ckpt_list(self):
    method merge_state_dict (line 190) | def merge_state_dict(self,
    method split_state_dict (line 200) | def split_state_dict(self,
    method sanity_check (line 210) | def sanity_check(self, ckpt_file_name):
  class MegatronSDLoader (line 214) | class MegatronSDLoader(SDLoaderBase):
    method __init__ (line 215) | def __init__(self, ckpt_list, version, checkpoint_engine):
    method merge_query_key_value (line 243) | def merge_query_key_value(self, param_list, ckpt_ver):
    method split_query_key_value (line 281) | def split_query_key_value(self, param, num_to_split, offset, ckpt_ver):
    method merge_state_dict (line 324) | def merge_state_dict(self,
    method split_state_dict (line 386) | def split_state_dict(self,
    method sanity_check (line 444) | def sanity_check(self, ckpt_file_name):
    method get_checkpoint_version (line 470) | def get_checkpoint_version(self, state_dict):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/swap_tensor/aio_config.py
  function get_aio_config (line 18) | def get_aio_config(param_dict):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/swap_tensor/async_swapper.py
  class AsyncTensorSwapper (line 17) | class AsyncTensorSwapper(object):
    method __init__ (line 18) | def __init__(self, aio_handle, numel_alignment, timers):
    method has_buffers (line 33) | def has_buffers(self):
    method add_buffers (line 36) | def add_buffers(self, buffer_list):
    method get_timer_names (line 48) | def get_timer_names(self):
    method release_buffers (line 51) | def release_buffers(self):
    method swap_out_tensors (line 65) | def swap_out_tensors(self, tensor_list, path_list):
    method _report_statistics (line 69) | def _report_statistics(self, message):
    method _swap_out_tensor (line 77) | def _swap_out_tensor(self, tensor, swap_path):
    method _make_swap_space (line 89) | def _make_swap_space(self, numel):
    method _io_aligned_numel (line 101) | def _io_aligned_numel(self, numel):
    method _allocate_buffer (line 105) | def _allocate_buffer(self):
    method _flush_ready_buffers (line 112) | def _flush_ready_buffers(self):
    method _flush_buffers_until_complete (line 119) | def _flush_buffers_until_complete(self):
    method _swap_out_ready_buffers (line 127) | def _swap_out_ready_buffers(self):
    method _wait_for_swap_complete (line 138) | def _wait_for_swap_complete(self):
    method _get_buffer (line 157) | def _get_buffer(self, index):
    method _get_current_buffer (line 161) | def _get_current_buffer(self):
    method _start_timer (line 164) | def _start_timer(self, name):
    method _stop_timer (line 168) | def _stop_timer(self, name):
    method _log_timers (line 172) | def _log_timers(self, name_list, force=False):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/swap_tensor/optimizer_utils.py
  class FlattenedTensorSwapInfo (line 19) | class FlattenedTensorSwapInfo(object):
    method __init__ (line 20) | def __init__(self, path, length, offset):
  class OptimizerStateSwapInfo (line 26) | class OptimizerStateSwapInfo(object):
    method __init__ (line 27) | def __init__(self, parameter, numel, base_folder):
    method numel (line 40) | def numel(self):
    method has_gradients (line 43) | def has_gradients(self):
    method _add_tensors (line 46) | def _add_tensors(self, tensor_list):
    method add_state_tensors (line 51) | def add_state_tensors(self, tensor_list):
    method device (line 55) | def device(self):
    method dtype (line 58) | def dtype(self):
    method release_memory (line 61) | def release_memory(self):
    method get_or_create_gradient_paths (line 65) | def get_or_create_gradient_paths(self, offsets, lengths):
    method set_swap_buffers (line 81) | def set_swap_buffers(self, buffers):
    method get_swap_gradient_buffers (line 87) | def get_swap_gradient_buffers(self, swap_buffer):
    method get_swap_gradient_paths (line 95) | def get_swap_gradient_paths(self):
    method get_unpinned_state_tensors (line 98) | def get_unpinned_state_tensors(self):
    method read_unswapped_gradients (line 101) | def read_unswapped_gradients(self, dest_buffer):
    method release_unswapped_gradients (line 110) | def release_unswapped_gradients(self):
  class OptimizerSwapper (line 118) | class OptimizerSwapper(object):
    method __init__ (line 119) | def __init__(self,
    method swappable_tensor (line 166) | def swappable_tensor(self, param=None, numel=None):
    method init_timers (line 172) | def init_timers(self):
    method log_timers (line 175) | def log_timers(self):
    method pre_backward (line 179) | def pre_backward(self):
    method post_backward (line 182) | def post_backward(self):
    method _flush_gradient_swapper (line 185) | def _flush_gradient_swapper(self, gradient_swapper):
    method _swap_out_gradients (line 194) | def _swap_out_gradients(self,
    method _initialize_from_swapped_fp16_params (line 241) | def _initialize_from_swapped_fp16_params(self,
    method _swap_in_fp16_params (line 294) | def _swap_in_fp16_params(self,
    method _swap_out_fp16_params (line 333) | def _swap_out_fp16_params(self,
    method _initialize_parameters (line 359) | def _initialize_parameters(self, parameters, src_tensors, aio_handle):
    method _get_swap_paths (line 389) | def _get_swap_paths(self, parameters, num_elems):
    method _swap_out_unpinned_tensors (line 400) | def _swap_out_unpinned_tensors(self,
    method _adjust_for_misaligned_lengths (line 427) | def _adjust_for_misaligned_lengths(self, tensors, offsets):
    method _retrieve_unswapped_grad_partitions (line 455) | def _retrieve_unswapped_grad_partitions(self, swap_info, dest_buffer):
    method _get_state_tensors (line 471) | def _get_state_tensors(self, parameter):
    method _update_param_state_info (line 482) | def _update_param_state_info(self, swap_info, parameter):
    method _create_param_swap_info (line 488) | def _create_param_swap_info(self, parameter, numel):
    method _get_param_swap_info (line 502) | def _get_param_swap_info(self, parameter):
    method _start_timer (line 511) | def _start_timer(self, name):
    method _stop_timer (line 515) | def _stop_timer(self, name):
    method _log_timers (line 519) | def _log_timers(self, name_list, force=False):
    method _io_aligned_numel (line 523) | def _io_aligned_numel(self, numel):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py
  class PartitionedOptimizerSwapper (line 27) | class PartitionedOptimizerSwapper(OptimizerSwapper):
    method __init__ (line 28) | def __init__(self,
    method initialize_parameters (line 70) | def initialize_parameters(self, parameters, src_tensors):
    method initialize_from_swapped_fp16_params (line 75) | def initialize_from_swapped_fp16_params(self,
    method flush_gradients (line 87) | def flush_gradients(self):
    method swap_in_optimizer_state (line 90) | def swap_in_optimizer_state(self, parameter, async_parameter=None):
    method swap_out_optimizer_state (line 120) | def swap_out_optimizer_state(self, parameter, async_swap=False):
    method swap_out_gradients (line 166) | def swap_out_gradients(self, parameter, gradient_offsets, gradient_ten...
    method _swap_in_parameter (line 172) | def _swap_in_parameter(self, aio_handle, parameter, dest_buffers):
    method _separate_pinned_tensors (line 206) | def _separate_pinned_tensors(self, swap_info):
    method _swap_in_pinned_gradients (line 223) | def _swap_in_pinned_gradients(self, aio_handle, parameter, gradient_te...
    method _swap_in_gradients (line 245) | def _swap_in_gradients(self, aio_handle, parameter, dest_buffer):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py
  function print_rank_0 (line 19) | def print_rank_0(message, debug=False, force=False):
  class PartitionedParamStatus (line 24) | class PartitionedParamStatus(Enum):
  class AsyncPartitionedParameterSwapper (line 35) | class AsyncPartitionedParameterSwapper(object):
    method __init__ (line 36) | def __init__(self, ds_config, model_dtype):
    method available_swap_in_buffers (line 81) | def available_swap_in_buffers(self):
    method _configure_aio (line 84) | def _configure_aio(self, ds_config):
    method swappable_tensor (line 131) | def swappable_tensor(self, param=None, numel=None):
    method get_path (line 139) | def get_path(self, param, must_exist=False):
    method _get_swap_paths (line 143) | def _get_swap_paths(self, params, must_exist=False):
    method _get_swap_buffers (line 159) | def _get_swap_buffers(self, params):
    method _track_numel (line 169) | def _track_numel(self, params):
    method _allocate_and_return_buffers_for_swap_in (line 174) | def _allocate_and_return_buffers_for_swap_in(self, params):
    method synchronize_writes (line 202) | def synchronize_writes(self):
    method synchronize_reads (line 211) | def synchronize_reads(self):
    method remove_partition_and_release_buffers (line 236) | def remove_partition_and_release_buffers(self, params):
    method _swap_out (line 259) | def _swap_out(self, params, async_op=True):
    method swap_out_and_release (line 274) | def swap_out_and_release(self, params, async_op=False, force_buffer_re...
    method _update_inflight_swap_in (line 280) | def _update_inflight_swap_in(self, params, swap_in_buffers, inflight_n...
    method swap_in (line 291) | def swap_in(self, params, async_op=True, swap_in_buffers=None):
    method swap_into_buffer (line 323) | def swap_into_buffer(self, param, dest_buffer):
    method get_buffer (line 349) | def get_buffer(self, param, numel):
    method reserve_available_buffers (line 369) | def reserve_available_buffers(self):
    method release_reserved_buffers (line 381) | def release_reserved_buffers(self):
    method _io_aligned_numel (line 386) | def _io_aligned_numel(self, numel):
    method _is_io_aligned (line 390) | def _is_io_aligned(self, numel):
    method reserve_partitioned_swap_space (line 393) | def reserve_partitioned_swap_space(self, partition_num_elems):
    method swap_out_partitioned_params (line 401) | def swap_out_partitioned_params(self, dst_fp16_params, src_fp32_params):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py
  class OptimizerSwapOp (line 18) | class OptimizerSwapOp(object):
    method __init__ (line 19) | def __init__(self,
    method is_parameter (line 34) | def is_parameter(self, parameter):
    method wait (line 37) | def wait(self):
  class PipelinedOptimizerSwapper (line 55) | class PipelinedOptimizerSwapper(OptimizerSwapper):
    method __init__ (line 56) | def __init__(self,
    method initialize_parameters (line 116) | def initialize_parameters(self, parameters, src_tensors):
    method initialize_from_swapped_fp16_params (line 121) | def initialize_from_swapped_fp16_params(self,
    method flush_gradients (line 133) | def flush_gradients(self):
    method swap_in_optimizer_state (line 136) | def swap_in_optimizer_state(self, parameter, async_parameter):
    method swap_out_optimizer_state (line 165) | def swap_out_optimizer_state(self, parameter, async_swap):
    method swap_out_gradients (line 190) | def swap_out_gradients(self, parameter, gradient_offsets, gradient_ten...
    method _complete_swap_out (line 196) | def _complete_swap_out(self, swap_out_type):
    method _swap_out_optimizer_state (line 201) | def _swap_out_optimizer_state(self, aio_handle, parameter, swap_in_op):
    method _swap_in_optimizer_state (line 239) | def _swap_in_optimizer_state(self, aio_handle, parameter):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/swap_tensor/utils.py
  function swap_in_tensors (line 17) | def swap_in_tensors(swap_handle, tensor_buffers, swap_paths):
  function swap_out_tensors (line 22) | def swap_out_tensors(swap_handle, tensor_buffers, swap_paths):
  function print_object (line 27) | def print_object(obj, name, exclude_list=[]):
  class SwapBuffer (line 35) | class SwapBuffer(object):
    method __init__ (line 36) | def __init__(self, buffer):
    method reset (line 40) | def reset(self):
    method insert_tensor (line 47) | def insert_tensor(self, tensor, swap_path, aligned_numel):
    method allocate_tensor (line 52) | def allocate_tensor(self, swap_path, numel, aligned_numel):
    method has_space (line 68) | def has_space(self, numel):
    method get_swap_tensors (line 71) | def get_swap_tensors(self):
    method get_swap_paths (line 74) | def get_swap_paths(self):
    method get_compute_tensors (line 77) | def get_compute_tensors(self):
    method get_num_elem (line 80) | def get_num_elem(self):
    method get_swap_tensor (line 83) | def get_swap_tensor(self, offset):
    method get_compute_tensor (line 86) | def get_compute_tensor(self, offset):
    method get_swap_path (line 89) | def get_swap_path(self, offset):
  class SwapBufferPool (line 93) | class SwapBufferPool(object):
    method __init__ (line 94) | def __init__(self, buffers):
    method reset (line 99) | def reset(self):
    method allocate_tensor (line 104) | def allocate_tensor(self, numel, swap_path, aligned_numel):
    method insert_tensor (line 111) | def insert_tensor(self, tensor, swap_path, aligned_numel):
    method get_swap_tensors (line 118) | def get_swap_tensors(self):
    method get_swap_paths (line 125) | def get_swap_paths(self):
    method get_compute_tensors (line 132) | def get_compute_tensors(self):
    method has_space (line 139) | def has_space(self, numel):
    method swap_out (line 149) | def swap_out(self, aio_handle, async_op=False):
    method swap_in (line 159) | def swap_in(self, aio_handle, async_op=False):
    method _get_current_buffer (line 169) | def _get_current_buffer(self):
    method _get_used_buffers (line 172) | def _get_used_buffers(self):
  class SwapBufferManager (line 176) | class SwapBufferManager(object):
    method __init__ (line 177) | def __init__(self, num_elems, count, dtype):
    method allocate (line 195) | def allocate(self, num_elems, count, dtype):
    method allocate_all (line 211) | def allocate_all(self, num_elems, dtype):
    method free (line 216) | def free(self, buffers):
  function get_sized_buffer (line 228) | def get_sized_buffer(buffer, num_elems):
  function get_sized_buffers (line 234) | def get_sized_buffers(buffer_list, num_elems_list):

FILE: benchmark/third_party/DeepSpeed/deepspeed/runtime/utils.py
  class DummyOptim (line 37) | class DummyOptim():
    method __init__ (line 42) | def __init__(self, params):
  function noop_decorator (line 47) | def noop_decorator(func):
  function ensure_directory_exists (line 51) | def ensure_directory_exists(filename):
  function set_random_seed (line 61) | def set_random_seed(seed):
  function is_model_parallel_parameter (line 74) | def is_model_parallel_parameter(p) -> bool:
  function bwc_tensor_model_parallel_rank (line 84) | def bwc_tensor_model_parallel_rank(mpu=None):
  function copy_to_device (line 122) | def copy_to_device(item, device, criterion_func):
  function move_to_device (line 146) | def move_to_device(item, device, criterion_func):
  class CheckOverflow (line 172) | class CheckOverflow(object):
    method __init__ (line 174) | def __init__(self,
    method check_using_norm (line 191) | def check_using_norm(self, norm_group, reduce_overflow=True):
    method check (line 214) | def check(self, param_groups=None):
    method has_overflow_serial (line 233) | def has_overflow_serial(self, params):
    method has_overflow (line 239) | def has_overflow(self, params, has_moe_params=None):
    method _has_inf_or_nan (line 283) | def _has_inf_or_nan(x, i):
  function _handle_overflow (line 304) | def _handle_overflow(cpu_sum, x, i):
  function get_global_norm (line 318) | def get_global_norm(norm_list):
  function clip_grad_norm_ (line 327) | def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None):
  function get_grad_norm (line 397) | def get_grad_norm(parameters, norm_type=2, mpu=None):
  function get_grad_zeros (line 459) | def get_grad_zeros(parameters, mpu=None):
  function get_weight_norm (line 501) | def get_weight_norm(parameters, norm_type=2, mpu=None):
  function prefix_sum_inc (line 562) | def prefix_sum_inc(weights):
  function partition_uniform (line 575) | def partition_uniform(num_items, num_parts):
  function _lprobe (line 590) | def _lprobe(weights, num_parts, bottleneck):
  function _rb_partition_balanced (line 625) | def _rb_partition_balanced(weights, num_p

Copy disabled (too large) Download .json

Condensed preview — 3433 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (52,522K chars).

[
  {
    "path": ".gitignore",
    "chars": 611,
    "preview": "# Mac system files\n.DS_store\n\n# built binaries\nbenchmark/third_party/pagecache-mangagement/trunk/fadv\nbenchmark/third_pa"
  },
  {
    "path": "LICENSE",
    "chars": 11418,
    "preview": "Copyright 2023 - The FlexLLMGen team. All rights reserved.\n\n                                  Apache License\n           "
  },
  {
    "path": "README.md",
    "chars": 12664,
    "preview": "# FlexLLMGen: High-throughput Generative Inference of Large Language Models with a Single GPU [[paper](https://arxiv.org"
  },
  {
    "path": "benchmark/batch_size_table.md",
    "chars": 1891,
    "preview": "## Effective Batch Size of Each System\n\n### Setup\n- Hardware: an NVIDIA T4 (16GB) instance on GCP with 208GB of DRAM and"
  },
  {
    "path": "benchmark/flexgen/bench_scan_175b.sh",
    "chars": 157,
    "preview": "python3 -m flexgen.flex_opt --model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 100 100 0 100 0 --gpu-ba"
  },
  {
    "path": "benchmark/flexllmgen/README.md",
    "chars": 1419,
    "preview": "# Benchmark FlexLLMGen\nNOTE: This benchmark uses dummy weights by default for faster experiments.\nIt is expected if you "
  },
  {
    "path": "benchmark/flexllmgen/bench_175b_1x4.sh",
    "chars": 758,
    "preview": "#!/bin/bash\n\nMY_IPADDR=$(hostname -i)\nall_hosts=$MY_IPADDR\nN_GPUS=4\nN_CORES_PER_GPU=12\n\nPYTHON_EXEC=$CONDA_PREFIX/bin/py"
  },
  {
    "path": "benchmark/flexllmgen/bench_175b_4x1.sh",
    "chars": 1147,
    "preview": "#!/bin/bash\n\nN_GPUS=1\nN_NODES=4\nN_CORES_PER_GPU=16\n\nMY_IPADDR=$(hostname -i)\nall_public_ips=$(ray get-worker-ips ~/ray_b"
  },
  {
    "path": "benchmark/flexllmgen/bench_30b_1x4.sh",
    "chars": 736,
    "preview": "#!/bin/bash\n\nMY_IPADDR=$(hostname -i)\nall_hosts=$MY_IPADDR\nN_GPUS=4\nN_CORES_PER_GPU=12\n\nPYTHON_EXEC=$CONDA_PREFIX/bin/py"
  },
  {
    "path": "benchmark/flexllmgen/bench_30b_4x1.sh",
    "chars": 1139,
    "preview": "#!/bin/bash\n\nN_GPUS=1\nN_NODES=4\nN_CORES_PER_GPU=16\n\nMY_IPADDR=$(hostname -i)\nall_public_ips=$(ray get-worker-ips ~/ray_b"
  },
  {
    "path": "benchmark/flexllmgen/bench_6.7b_1x4.sh",
    "chars": 724,
    "preview": "#!/bin/bash\n\nMY_IPADDR=$(hostname -i)\nall_hosts=$MY_IPADDR\nN_GPUS=4\nN_CORES_PER_GPU=6\n\nPYTHON_EXEC=$CONDA_PREFIX/bin/pyt"
  },
  {
    "path": "benchmark/flexllmgen/bench_6.7b_4x1.sh",
    "chars": 1064,
    "preview": "#!/bin/bash\n\nN_GPUS=1\nN_NODES=4\nN_CORES_PER_GPU=16\n\nMY_IPADDR=$(hostname -i)\nall_public_ips=$(ray get-worker-ips ~/ray_b"
  },
  {
    "path": "benchmark/flexllmgen/bench_dist_multi_node.sh",
    "chars": 1067,
    "preview": "#!/bin/bash\n\nN_GPUS=1\nN_NODES=4\nN_CORES_PER_GPU=16\n\nMY_IPADDR=$(hostname -i)\nall_public_ips=$(ray get-worker-ips ~/ray_b"
  },
  {
    "path": "benchmark/flexllmgen/bench_dist_single_node.sh",
    "chars": 682,
    "preview": "#!/bin/bash\n\nMY_IPADDR=$(hostname -i)\nall_hosts=$MY_IPADDR\nN_GPUS=4\nN_CORES_PER_GPU=4\n\nPYTHON_EXEC=$CONDA_PREFIX/bin/pyt"
  },
  {
    "path": "benchmark/flexllmgen/bench_suite.py",
    "chars": 12084,
    "preview": "import argparse\nfrom dataclasses import dataclass\n\nfrom flexllmgen.utils import run_cmd\n\n\n@dataclass\nclass Case:\n    com"
  },
  {
    "path": "benchmark/hf_ds/README.md",
    "chars": 573,
    "preview": "# Benchmark Baselines\n\n## Install\nInstall the forks of Huggingface/transformers and Microsoft/DeepSpeed following this ["
  },
  {
    "path": "benchmark/hf_ds/bench_all_1x4.sh",
    "chars": 611,
    "preview": "python3 hf_opt.py --num-gpus 4 --model facebook/opt-6.7b --dummy --cut-gen-len 5 --batch-size 16\ndeepspeed --num_gpus 4 "
  },
  {
    "path": "benchmark/hf_ds/bench_ds_175b_4x1.sh",
    "chars": 163,
    "preview": "deepspeed --num_nodes 4 --num_gpus 1 --master_port 7778 --hostfile hostfile \\\n    hf_opt.py --model facebook/opt-175b --"
  },
  {
    "path": "benchmark/hf_ds/bench_ds_30b_1x4.sh",
    "chars": 104,
    "preview": "deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-30b --batch-size 24 --cut-gen-len 5 --cpu --dummy\n"
  },
  {
    "path": "benchmark/hf_ds/bench_ds_30b_4x1.sh",
    "chars": 163,
    "preview": "deepspeed --num_nodes 4 --num_gpus 1 --master_port 7778 --hostfile hostfile \\\n    hf_opt.py --model facebook/opt-30b --b"
  },
  {
    "path": "benchmark/hf_ds/bench_ds_6.7b_1x4.sh",
    "chars": 99,
    "preview": "deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-6.7b --batch-size 48 --cut-gen-len 5 --dummy\n"
  },
  {
    "path": "benchmark/hf_ds/bench_ds_6.7b_2x1.sh",
    "chars": 158,
    "preview": "deepspeed --num_nodes 2 --num_gpus 1 --master_port 7778 --hostfile hostfile \\\n    hf_opt.py --model facebook/opt-6.7b --"
  },
  {
    "path": "benchmark/hf_ds/bench_ds_6.7b_4x1.sh",
    "chars": 158,
    "preview": "deepspeed --num_nodes 4 --num_gpus 1 --master_port 7778 --hostfile hostfile \\\n    hf_opt.py --model facebook/opt-6.7b --"
  },
  {
    "path": "benchmark/hf_ds/bench_hf.py",
    "chars": 4451,
    "preview": "import argparse\nfrom dataclasses import dataclass\nimport time\n\nfrom flexllmgen.utils import run_cmd\n\n\ndef run_huggingfac"
  },
  {
    "path": "benchmark/hf_ds/hf_opt.py",
    "chars": 13230,
    "preview": "\"\"\"\nRun OPT with huggingface or deepspeed.\n\nUsage:\ndeepspeed --num_gpus 1 hf_opt.py --model facebook/opt-1.3b --batch-si"
  },
  {
    "path": "benchmark/hf_ds/hostfile",
    "chars": 44,
    "preview": "172.31.19.249 slots=1\n172.31.29.45  slots=1\n"
  },
  {
    "path": "benchmark/petals/README.md",
    "chars": 3861,
    "preview": "# Running Petals benchmarks\n\nThis guide contains the steps necessary to reproduce experiments in Section 6.3 and Table 1"
  },
  {
    "path": "benchmark/petals/run_opt_requests.py",
    "chars": 4583,
    "preview": "import time\nfrom argparse import ArgumentParser\nfrom statistics import mean\n\nimport torch\nfrom petals import Distributed"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.clang-format",
    "chars": 4509,
    "preview": "---\n# Refer to the following link for the explanation of each params:\n#   http://releases.llvm.org/8.0.0/tools/clang/doc"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/compression_bug_report.md",
    "chars": 1161,
    "preview": "---\nname: Bug report (compression)\nabout: Create a DeepSpeed compression related issue to help us improve\ntitle: \"[BUG]\""
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/feature_request.md",
    "chars": 613,
    "preview": "---\nname: Feature request\nabout: Suggest an idea for this project\ntitle: \"[REQUEST]\"\nlabels: enhancement\nassignees: ''\n\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/inference_bug_report.md",
    "chars": 1188,
    "preview": "---\nname: Bug report (inference)\nabout: Create a DeepSpeed inference related issue to help us improve\ntitle: \"[BUG]\"\nlab"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/training_bug_report.md",
    "chars": 1152,
    "preview": "---\nname: Bug report (training)\nabout: Create a DeepSpeed training related issue to help us improve\ntitle: \"[BUG]\"\nlabel"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/amd.yml",
    "chars": 2343,
    "preview": "name: amd\n\non:\n  push:\n    branches:\n      - 'master'\n      - 'staging**'\n    paths-ignore:\n      - 'docs/**'\n  pull_req"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/formatting.yml",
    "chars": 646,
    "preview": "name: Formatting\n\non:\n  push:\n    branches:\n      - 'master'\n      - 'staging**'\n  pull_request:\n    branches:\n      '**"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/nv-accelerate-v100.yml",
    "chars": 1955,
    "preview": "name: nv-accelerate-v100\n\non:\n  push:\n    branches:\n      - 'master'\n      - 'staging**'\n    paths-ignore:\n      - 'docs"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/nv-inference.yml",
    "chars": 2027,
    "preview": "name: nv-inference\n\non:\n  push:\n    branches:\n      - 'master'\n      - 'staging**'\n    paths-ignore:\n      - 'docs/**'\n "
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/nv-lightning-v100.yml",
    "chars": 1575,
    "preview": "name: nv-lightning-v100\n\non:\n  push:\n    branches:\n      - 'master'\n      - 'staging**'\n    paths-ignore:\n      - 'docs/"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/nv-mii.yml",
    "chars": 1712,
    "preview": "name: nv-mii\n\non:\n  push:\n    branches:\n      - 'master'\n      - 'staging**'\n    paths-ignore:\n      - 'docs/**'\n  pull_"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/nv-nightly.yml",
    "chars": 2123,
    "preview": "name: nv-nightly\n\non:\n  schedule:\n    - cron: \"0 0 * * *\"\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.ref }"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/nv-torch-latest-v100.yml",
    "chars": 2058,
    "preview": "name: nv-torch-latest-v100\n\non:\n  push:\n    branches:\n      - 'master'\n      - 'staging**'\n    paths-ignore:\n      - 'do"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/nv-torch-nightly-v100.yml",
    "chars": 1888,
    "preview": "name: nv-torch-nightly-v100\n\non:\n  schedule:\n    - cron: \"0 0 * * *\"\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ g"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/nv-torch18-p40.yml",
    "chars": 1838,
    "preview": "name: nv-torch18-p40\n\non:\n  push:\n    branches:\n      - 'master'\n      - 'staging**'\n    paths-ignore:\n      - 'docs/**'"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/nv-torch18-v100.yml",
    "chars": 2076,
    "preview": "name: nv-torch18-v100\n\non:\n  push:\n    branches:\n      - 'master'\n      - 'staging**'\n    paths-ignore:\n      - 'docs/**"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/nv-transformers-v100.yml",
    "chars": 2499,
    "preview": "name: nv-transformers-v100\n\non:\n  push:\n    branches:\n      - 'master'\n      - 'staging**'\n    paths-ignore:\n      - 'do"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/pre-compile-ops.yml",
    "chars": 1584,
    "preview": "# This is a basic workflow to help you get started with Actions\n\nname: Tests-w-precompiled-ops\n\n# Controls when the acti"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.github/workflows/python.yml",
    "chars": 734,
    "preview": "name: python\n\non:\n  push:\n    branches:\n      - 'master'\n      - 'staging**'\n  pull_request:\n    branches:\n      '**'\n\nc"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.gitignore",
    "chars": 384,
    "preview": "*.pyc\n.idea/\n*~\n*.swp\n*.log\ndeepspeed/git_version_info_installed.py\n__pycache__\n\n# Build + installation data\nbuild/\ndist"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.pre-commit-config.yaml",
    "chars": 2170,
    "preview": "repos:\n-   repo: meta\n    hooks:\n    -   id: check-hooks-apply\n    -   id: check-useless-excludes\n\n-   repo: https://git"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.pylintrc",
    "chars": 17879,
    "preview": "[MASTER]\n\n# A comma-separated list of package or module names from where C extensions may\n# be loaded. Extensions are lo"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.readthedocs.yml",
    "chars": 423,
    "preview": "\n# Required\nversion: 2\n\n# Build documentation in the docs/ directory with Sphinx\nsphinx:\n  configuration: docs/code-docs"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/.style.yapf",
    "chars": 66,
    "preview": "[style]\nSPLIT_ALL_COMMA_SEPARATED_VALUES = true\nCOLUMN_LIMIT = 89\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/CODEOWNERS",
    "chars": 2269,
    "preview": "# This file is used to subscribe for notifications for PRs\n# related to specific file paths, does not necessarily mean\n#"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/CODE_OF_CONDUCT.md",
    "chars": 444,
    "preview": "# Microsoft Open Source Code of Conduct\n\nThis project has adopted the [Microsoft Open Source Code of Conduct](https://op"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/CONTRIBUTING.md",
    "chars": 5957,
    "preview": "# Contributing\nDeepSpeed welcomes your contributions!\n\n## Prerequisites\nDeepSpeed uses [pre-commit](https://pre-commit.c"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/LICENSE",
    "chars": 1141,
    "preview": "    MIT License\n\n    Copyright (c) Microsoft Corporation.\n\n    Permission is hereby granted, free of charge, to any pers"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/MANIFEST.in",
    "chars": 255,
    "preview": "include *.txt README.md\nrecursive-include requirements *.txt\nrecursive-include deepspeed *.cpp *.h *.cu *.hip *.tr *.cuh"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/MANIFEST_win.in",
    "chars": 213,
    "preview": "include *.txt README.md\nrecursive-include requirements *.txt\n\n# this is for Windows only\nrecursive-include deepspeed *.t"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/README.md",
    "chars": 22093,
    "preview": "[![License MIT](https://badgen.net/badge/license/MIT/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE)\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/SECURITY.md",
    "chars": 2824,
    "preview": "<!-- BEGIN MICROSOFT SECURITY.MD V0.0.3 BLOCK -->\n\n## Security\n\nMicrosoft takes the security of our software products an"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/azure/README.md",
    "chars": 283,
    "preview": "# Getting Started with DeepSpeed on Azure\n\nThe recommended and simplest method to try DeepSpeed on Azure is through [Azu"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/communication/README.md",
    "chars": 2921,
    "preview": "# Running Communication Benchmarks\n\n\nTo run benchmarks, there are two options:\n\n1. Run a single communication operation:"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/communication/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/communication/all_gather.py",
    "chars": 6019,
    "preview": "from benchmarks.communication.utils import *\nfrom benchmarks.communication.constants import *\n\nimport time\n\n\n# Run all_g"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/communication/all_reduce.py",
    "chars": 3802,
    "preview": "from benchmarks.communication.utils import *\nfrom benchmarks.communication.constants import *\n\nimport time\n\n\ndef timed_a"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/communication/all_to_all.py",
    "chars": 4791,
    "preview": "from benchmarks.communication.utils import *\nfrom benchmarks.communication.constants import *\n\nimport time\n\n\ndef timed_a"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/communication/broadcast.py",
    "chars": 3811,
    "preview": "import torch\nfrom benchmarks.communication.utils import *\nfrom benchmarks.communication.constants import *\n\nimport time\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/communication/constants.py",
    "chars": 158,
    "preview": "DEFAULT_WARMUPS = 5\nDEFAULT_TRIALS = 50\nDEFAULT_TYPE = 'float'\nDEFAULT_BACKEND = 'nccl'\nDEFAULT_UNIT = 'Gbps'\nDEFAULT_DI"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/communication/pt2pt.py",
    "chars": 4320,
    "preview": "from benchmarks.communication.utils import *\nfrom benchmarks.communication.constants import *\n\nimport time\n\n\ndef timed_p"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/communication/run_all.py",
    "chars": 1558,
    "preview": "from benchmarks.communication.utils import *\nfrom benchmarks.communication.all_reduce import run_all_reduce\nfrom benchma"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/communication/utils.py",
    "chars": 7847,
    "preview": "import torch\nimport os\nimport math\nimport argparse\nfrom benchmarks.communication.constants import *\n\nglobal dist\n\n\ndef i"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/inference/bert-bench.py",
    "chars": 3120,
    "preview": "import torch\nimport time\nimport deepspeed\nimport argparse\nfrom transformers import pipeline\n\nparser = argparse.ArgumentP"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/inference/collect_results.py",
    "chars": 4579,
    "preview": "import os\nimport re\nimport argparse\nimport pandas as pd\n\nparser = argparse.ArgumentParser()\nparser.add_argument(\n    \"--"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/inference/gpt-bench.py",
    "chars": 4091,
    "preview": "import os\nimport torch\nimport time\nimport deepspeed\nimport argparse\nfrom transformers import pipeline\n\nparser = argparse"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/inference/requirements.txt",
    "chars": 21,
    "preview": "transformers>=4.21.3\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/inference/run_model.sh",
    "chars": 805,
    "preview": "set -x\n\nmodel=$1\nbranch1=$2\nbranch2=$3\ndtype=$4\ngraphs=$5\nkernel=$6\ngpus=$7\n\nversion=0\nlog_path=results/${model}_${dtype"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/benchmarks/inference/sweep.sh",
    "chars": 1392,
    "preview": "set -x\n\nexport TRANSFORMERS_CACHE=/tmp/hf-cache\n\nbranch1=$1\nbranch2=$2\n\ngptneo_models=\"EleutherAI/gpt-neo-2.7B EleutherA"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/bin/ds",
    "chars": 106,
    "preview": "#!/usr/bin/env python3\n\nfrom deepspeed.launcher.runner import main\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/bin/ds_bench",
    "chars": 757,
    "preview": "#!/usr/bin/env python3\n\nfrom benchmarks.communication.run_all import main\nfrom benchmarks.communication.constants import"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/bin/ds_elastic",
    "chars": 1805,
    "preview": "#!/usr/bin/env python3\n\nimport argparse\nimport json\n\nimport deepspeed\nfrom deepspeed.elasticity import compute_elastic_c"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/bin/ds_report",
    "chars": 109,
    "preview": "#!/usr/bin/env python3\n\nfrom deepspeed.env_report import cli_main\n\nif __name__ == '__main__':\n    cli_main()\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/bin/ds_ssh",
    "chars": 680,
    "preview": "#!/bin/bash\n\n# Copyright 2020 The Microsoft DeepSpeed Team\n\ncommand -v pdsh\nif [ $? != 0 ]; then\n    echo \"Cannot find p"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/build_win.bat",
    "chars": 337,
    "preview": "@echo off\n\nset DS_BUILD_AIO=0\nset DS_BUILD_SPARSE_ATTN=0\n\necho Administrative permissions required. Detecting permission"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/adagrad/cpu_adagrad.cpp",
    "chars": 8004,
    "preview": "#include \"cpu_adagrad.h\"\n#include <cuda_runtime_api.h>\n#include <math.h>\n#include <omp.h>\n#include <torch/extension.h>\n#"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/adam/cpu_adam.cpp",
    "chars": 9975,
    "preview": "#include \"cpu_adam.h\"\n#include <cuda_runtime_api.h>\n#include <math.h>\n#include <omp.h>\n#include <torch/extension.h>\n#inc"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/adam/fused_adam_frontend.cpp",
    "chars": 781,
    "preview": "#include <torch/extension.h>\n\nvoid multi_tensor_adam_cuda(int chunk_size,\n                            at::Tensor noop_fl"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/adam/multi_tensor_adam.cu",
    "chars": 6524,
    "preview": "/* Copyright 2020 The Microsoft DeepSpeed Team\n   Copyright NVIDIA/apex\n   This file is adapted from fused adam in NVIDI"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/adam/multi_tensor_apply.cuh",
    "chars": 5546,
    "preview": "/* Copyright 2020 The Microsoft DeepSpeed Team\n   Copyright NVIDIA/apex\n   This file is adapted from fused adam in NVIDI"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/common/deepspeed_aio_common.cpp",
    "chars": 13018,
    "preview": "/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/common/deepspeed_aio_common.h",
    "chars": 1342,
    "preview": "/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/common/deepspeed_aio_types.cpp",
    "chars": 2011,
    "preview": "/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/common/deepspeed_aio_types.h",
    "chars": 1380,
    "preview": "/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/common/deepspeed_aio_utils.cpp",
    "chars": 4203,
    "preview": "/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/common/deepspeed_aio_utils.h",
    "chars": 2064,
    "preview": "/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_aio_thread.cpp",
    "chars": 2685,
    "preview": "/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_aio_thread.h",
    "chars": 1405,
    "preview": "/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_py_aio.cpp",
    "chars": 4228,
    "preview": "\n/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer te"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_py_aio.h",
    "chars": 954,
    "preview": "\n/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer te"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp",
    "chars": 9563,
    "preview": "\n/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer te"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_py_aio_handle.h",
    "chars": 2128,
    "preview": "/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_py_copy.cpp",
    "chars": 4400,
    "preview": "/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_lib/deepspeed_py_copy.h",
    "chars": 1148,
    "preview": "\n/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer te"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_lib/py_ds_aio.cpp",
    "chars": 1608,
    "preview": "/*\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality for swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_test/aio_bench_generate_param.py",
    "chars": 2785,
    "preview": "\"\"\"\nCopyright 2021 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality of swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_test/aio_bench_perf_sweep.py",
    "chars": 11432,
    "preview": "\"\"\"\nCopyright 2021 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality of swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_test/ds_aio_basic.py",
    "chars": 4262,
    "preview": "\"\"\"\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality of swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_test/ds_aio_handle.py",
    "chars": 5080,
    "preview": "\"\"\"\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality of swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_test/parse_aio_stats.py",
    "chars": 3973,
    "preview": "\"\"\"\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality of swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_test/perf_sweep_utils.py",
    "chars": 329,
    "preview": "SCRIPT_PREFIX = '_aio_bench'\nWRITE_OP_DESC = 'write'\nREAD_OP_DESC = 'read'\nREAD_IO_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_test/run_read_sweep.sh",
    "chars": 1963,
    "preview": "#!/bin/bash\nif [[ $# -ne 2 ]]; then\n    echo \"Usage: $0 <input file> <output log dir>\"\n    exit 1\nfi\n\n\nfunction validate"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_test/run_write_sweep.sh",
    "chars": 2076,
    "preview": "#!/bin/bash\nfunction prep_folder()\n{\n    folder=$1\n    if [[ -d ${folder} ]]; then\n        rm -f ${folder}/*\n    else\n  "
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_test/single_process_config.json",
    "chars": 359,
    "preview": "{\n    \"block_size\": [\n        \"128K\",\n        \"256K\",\n        \"1M\"\n    ],\n    \"queue_depth\": [\n        4,\n        16,\n  "
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_test/test_ds_aio.py",
    "chars": 2856,
    "preview": "\"\"\"\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality of swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_test/test_ds_aio_utils.py",
    "chars": 1772,
    "preview": "\"\"\"\nCopyright 2020 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality of swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/aio/py_test/validate_async_io.py",
    "chars": 246,
    "preview": "\"\"\"\nCopyright 2021 The Microsoft DeepSpeed Team\nLicensed under the MIT license.\n\nFunctionality of swapping optimizer ten"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/common/custom_cuda_kernel.cu",
    "chars": 1179,
    "preview": "#include \"custom_cuda_layers.h\"\n\n__global__ void param_update_kernel(const float* input, __half* output, int size)\n{\n   "
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/StopWatch.h",
    "chars": 1882,
    "preview": "#pragma once\n#ifdef _WIN32\n#include <windows.h>\n#else\n#include <time.h>\n#endif\n\n#ifdef _WIN32\n\nclass Stopwatch {\nprivate"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/Timer.h",
    "chars": 1082,
    "preview": "\n#ifndef __TIMER_H__\n#define __TIMER_H__\n\n#include <cuda_runtime.h>\n#include <chrono>\n#include \"cuda.h\"\n\nclass GPUTimer "
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/compat.h",
    "chars": 287,
    "preview": "/* Copyright 2020 The Microsoft DeepSpeed Team\n   Copyright NVIDIA/apex\n   This file is adapted from fused adam in NVIDI"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/context.h",
    "chars": 6818,
    "preview": "#pragma once\n\n#include <ATen/cuda/CUDAContext.h>\n#include <cuda_runtime_api.h>\n#include <cassert>\n#include <iostream>\n#i"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/conversion_utils.h",
    "chars": 12060,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#pragma once\n\n#include \"ds_kernel_utils.h\"\n\n#include <cuda_fp16.h>\n#i"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/cpu_adagrad.h",
    "chars": 5014,
    "preview": "#pragma once\n\n#define NOMINMAX  // Windows idiosyncrasy\n                  // https://stackoverflow.com/questions/4913922"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/cpu_adam.h",
    "chars": 7514,
    "preview": "#pragma once\n\n#define NOMINMAX  // Windows idiosyncrasy\n                  // https://stackoverflow.com/questions/4913922"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/cublas_wrappers.h",
    "chars": 3178,
    "preview": "#pragma once\n\n#include <assert.h>\n#include <cublas_v2.h>\n#include <cuda.h>\n#include <cuda_fp16.h>\n#include <cuda_runtime"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/custom_cuda_layers.h",
    "chars": 11025,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#pragma once\n\n#include \"ds_kernel_utils.h\"\n#include \"quantization.h\"\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/dequantization_utils.h",
    "chars": 7091,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#include \"conversion_utils.h\"\n#include \"ds_kernel_utils.h\"\n#include \""
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/dropout.h",
    "chars": 2096,
    "preview": "#pragma once\n\n#include <cuda.h>\n#include <cuda_fp16.h>\n#include <stdio.h>\n\ntemplate <typename T>\nclass Dropout {\npublic:"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/ds_kernel_utils.h",
    "chars": 876,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n\nCentralized header file for preprocessor macros and constants\nused throu"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/ds_transformer_cuda.h",
    "chars": 6065,
    "preview": "#pragma once\n\n#include <cuda_runtime_api.h>\n#include <curand.h>\n#include <memory>\n#include <vector>\n#include \"cublas_v2."
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/feed_forward.h",
    "chars": 3090,
    "preview": "#ifndef __FEEDFORWARD_H__\n#define __FEEDFORWARD_H__\n\n#include <cuda.h>\n#include <cuda_fp16.h>\n#include <stdio.h>\n#includ"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/gelu.h",
    "chars": 919,
    "preview": "#pragma once\n\n#include <cuda.h>\n#include <cuda_fp16.h>\n#include <stdio.h>\n#include \"custom_cuda_layers.h\"\n\ntemplate <typ"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/gemm_test.h",
    "chars": 10228,
    "preview": "\n#pragma once\n\n#include <cuda_fp16.h>\n#ifndef __HIP_PLATFORM_HCC__\n#include <cuda_profiler_api.h>\n#endif\n#include <array"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/general_kernels.h",
    "chars": 1408,
    "preview": "#include <cuda.h>\n#include <cuda_fp16.h>\n#include <stdio.h>\n#include <stdlib.h>\n\n#ifdef __HIP_PLATFORM_HCC__\n#include <h"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/memory_access_utils.h",
    "chars": 30575,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#pragma once\n\n#include <cuda.h>\n#include \"ds_kernel_utils.h\"\n\n///////"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/normalize_layer.h",
    "chars": 6990,
    "preview": "#pragma once\n\n#include <cuda.h>\n#include <cuda_fp16.h>\n#include <stdio.h>\n#include <fstream>\n#include \"custom_cuda_layer"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/quantization.h",
    "chars": 2140,
    "preview": "\n#pragma once\n\n#include <cuda_fp16.h>\n#include \"ds_kernel_utils.h\"\n\nnamespace quantize {\n\nenum class Type { Symmetric, A"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/quantization_utils.h",
    "chars": 18251,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#include <cassert>\n#include \"conversion_utils.h\"\n#include \"ds_kernel_"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/quantizer.h",
    "chars": 165,
    "preview": "#pragma once\n\n#include <cooperative_groups.h>\n#include <cuda.h>\n#include <cuda_fp16.h>\n#include <stdio.h>\n#include <stdl"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/reduction_utils.h",
    "chars": 18593,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#pragma once\n\n#include \"conversion_utils.h\"\n#include \"ds_kernel_utils"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/simd.h",
    "chars": 4400,
    "preview": "#pragma once\n\n#if (__x86_64__ || __i386__)\n#include <cpuid.h>\n#include <x86intrin.h>\n#endif\n\n#define TILE (128 * 1024 * "
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/softmax.h",
    "chars": 1543,
    "preview": "#pragma once\n\n#include <cuda.h>\n#include <cuda_fp16.h>\n#include <stdio.h>\n#include \"custom_cuda_layers.h\"\n\n#include <fst"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/strided_batch_gemm.h",
    "chars": 6708,
    "preview": "#pragma once\n\n#include <cuda.h>\n#include <cuda_fp16.h>\n#include <stdio.h>\n#include \"context.h\"\n\ntemplate <typename T>\ncl"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/includes/type_shim.h",
    "chars": 6289,
    "preview": "/* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */\n#include <ATen/ATen.h>\n\n// Forward/backward"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/lamb/fused_lamb_cuda.cpp",
    "chars": 3953,
    "preview": "/* Copyright 2019 The Microsoft DeepSpeed Team */\n#include <torch/extension.h>\n\n// CUDA forward declaration\nvoid fused_l"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/lamb/fused_lamb_cuda_kernel.cu",
    "chars": 15242,
    "preview": "/* Copyright 2019 The Microsoft DeepSpeed Team */\n#include <cuda.h>\n#include <cuda_runtime.h>\n#include <stdio.h>\n#includ"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/quantization/dequantize.cu",
    "chars": 2254,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#include \"dequantization_utils.h\"\n#include \"memory_access_utils.h\"\n\nn"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/quantization/fake_quantizer.cu",
    "chars": 37466,
    "preview": "#include <math.h>\n#include \"custom_cuda_layers.h\"\n#include \"memory_access_utils.h\"\n\nnamespace cg = cooperative_groups;\n\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/quantization/pt_binding.cpp",
    "chars": 6439,
    "preview": "#include <ATen/cuda/CUDAContext.h>\n#include <torch/extension.h>\n#include <cassert>\n#include <vector>\n#include \"quantizat"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/quantization/quantize.cu",
    "chars": 8619,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#include \"memory_access_utils.h\"\n#include \"quantization.h\"\n#include \""
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/sparse_attention/utils.cpp",
    "chars": 4425,
    "preview": "// DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a\n// https://github.com/ptill"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/spatial/csrc/opt_bias_add.cu",
    "chars": 6225,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#include <cassert>\n#include \"memory_access_utils.h\"\n#include \"spatial"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/spatial/csrc/pt_binding.cpp",
    "chars": 3815,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#include <c10/cuda/CUDAStream.h>\n#include <torch/extension.h>\n#includ"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/spatial/includes/spatial_cuda_layers.h",
    "chars": 785,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#pragma once\n\n#if __CUDA_ARCH__ >= 530\n#define HALF_PRECISION_AVAILAB"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/cublas_wrappers.cu",
    "chars": 17473,
    "preview": "#include \"cublas_wrappers.h\"\n\n#ifdef __HIP_PLATFORM_HCC__\nint cublas_gemm_ex(rocblas_handle handle,\n                   r"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/dropout_kernels.cu",
    "chars": 29712,
    "preview": "#include \"custom_cuda_layers.h\"\n\nconst int unroll_factor = 4;\n\n__global__ void dropout_kernel(const int N,\n             "
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/ds_transformer_cuda.cpp",
    "chars": 47536,
    "preview": "#include <torch/extension.h>\n\n#include <cublas_v2.h>\n#include <cuda_fp16.h>\n#include <cuda_runtime.h>\n#include <type_tra"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/gelu_kernels.cu",
    "chars": 12092,
    "preview": "#include \"custom_cuda_layers.h\"\n\ninline __device__ float gelu(const float x)\n{\n    const float sqrt_param = 0.7978845608"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/general_kernels.cu",
    "chars": 14421,
    "preview": "#include \"general_kernels.h\"\n\nnamespace cg = cooperative_groups;\n\ntemplate <typename T>\n__global__ void column_sum_reduc"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu",
    "chars": 16302,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#include \"inference_cuda_layers.h\"\n\n#ifndef __HIP_PLATFORM_HCC__\n#inc"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/inference/csrc/dequantize.cu",
    "chars": 6860,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#include \"inference_cuda_layers.h\"\n\n#define MAX_QUANTIZE_GROUPING 102"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/inference/csrc/gelu.cu",
    "chars": 28997,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#include \"conversion_utils.h\"\n#include \"inference_cuda_layers.h\"\n#inc"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/inference/csrc/layer_norm.cu",
    "chars": 17470,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#include \"conversion_utils.h\"\n#include \"inference_cuda_layers.h\"\n#inc"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/inference/csrc/pt_binding.cpp",
    "chars": 73568,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#include <c10/cuda/CUDAStream.h>\n#include <torch/extension.h>\n#includ"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/inference/csrc/relu.cu",
    "chars": 2147,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#include \"conversion_utils.h\"\n#include \"inference_cuda_layers.h\"\n#inc"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/inference/csrc/softmax.cu",
    "chars": 22413,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#include <limits>\n#include \"inference_cuda_layers.h\"\n\n#ifndef __HIP_P"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/inference/csrc/transform.cu",
    "chars": 29483,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#ifndef __HIP_PLATFORM_HCC__\n#include <cuda_profiler_api.h>\n#endif\n#i"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/inference/includes/inference_context.h",
    "chars": 9286,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#pragma once\n\n#include <c10/cuda/CUDAStream.h>\n#include <cuda_runtime"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/inference/includes/inference_cublas_wrappers.h",
    "chars": 17691,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#pragma once\n\n#include <assert.h>\n#include <cublas_v2.h>\n#include <cu"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/inference/includes/inference_cuda_layers.h",
    "chars": 8095,
    "preview": "/*\nCopyright 2022 The Microsoft DeepSpeed Team\n*/\n\n#pragma once\n\n#include \"ds_kernel_utils.h\"\n\n#include <cuda.h>\n#includ"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/normalize_kernels.cu",
    "chars": 74801,
    "preview": "#include \"custom_cuda_layers.h\"\n\nnamespace cg = cooperative_groups;\n\n/*\nFused bias add, residual (elementwise) add, and "
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/softmax_kernels.cu",
    "chars": 22829,
    "preview": "#include <math.h>\n#include \"custom_cuda_layers.h\"\n#include \"general_kernels.h\"\n\nnamespace cg = cooperative_groups;\n\ndim3"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/transformer/transform_kernels.cu",
    "chars": 22610,
    "preview": "#include \"custom_cuda_layers.h\"\n\n#define rows_trans 16\n#define cols_trans 16\n\ntemplate <typename T>\n__global__ void Tran"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/csrc/utils/flatten_unflatten.cpp",
    "chars": 742,
    "preview": "/*\n   Copyright 2020 The Microsoft DeepSpeed Team\n   Copyright NVIDIA/apex\n   This file is adapted from fused adam in NV"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/__init__.py",
    "chars": 11822,
    "preview": "'''\nCopyright 2020 The Microsoft DeepSpeed Team\n'''\n\nimport sys\nimport types\nimport json\nfrom typing import Optional, Un"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/accelerator/__init__.py",
    "chars": 118,
    "preview": "from .abstract_accelerator import DeepSpeedAccelerator\nfrom .real_accelerator import get_accelerator, set_accelerator\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/accelerator/abstract_accelerator.py",
    "chars": 4325,
    "preview": "import abc\nfrom abc import ABC\n\n\nclass DeepSpeedAccelerator(ABC):\n    def __init__(self):\n        self._name = None\n    "
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/accelerator/cuda_accelerator.py",
    "chars": 8258,
    "preview": "from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator\nimport torch.cuda\n\n\nclass CUDA_Accelerator(D"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/accelerator/real_accelerator.py",
    "chars": 2758,
    "preview": "from .abstract_accelerator import DeepSpeedAccelerator\n\nds_accelerator = None\n\n\ndef _validate_accelerator(accel_obj):\n  "
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/.gitignore",
    "chars": 61,
    "preview": "test*\nruns\nautotuning_results*\nautotuning_exps\noutput*\n*.png\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/README.md",
    "chars": 28079,
    "preview": "# DeepSpeed Autotuning\n## Overview\n\nOne pain point in model training is to figure out good performance-relevant configur"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/__init__.py",
    "chars": 33,
    "preview": "from .autotuner import Autotuner\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/autotuner.py",
    "chars": 53516,
    "preview": "import shutil\nimport subprocess\nimport torch\nimport time\nimport datetime\nimport math\nimport hjson\n\nfrom ..runtime.config"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/config.py",
    "chars": 5637,
    "preview": "\"\"\"\nCopyright (c) Microsoft Corporation\nLicensed under the MIT license.\n\"\"\"\n\nfrom deepspeed.runtime.config_utils import "
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/config_templates/template_zero0.json",
    "chars": 48,
    "preview": "{\n  \"zero_optimization\": {\n    \"stage\": 0\n  }\n}\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/config_templates/template_zero1.json",
    "chars": 113,
    "preview": "{\n  \"zero_optimization\": {\n    \"stage\": 1,\n    \"reduce_bucket_size\": 5e8,\n    \"allgather_bucket_size\": 5e8\n  }\n}\n"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/config_templates/template_zero2.json",
    "chars": 237,
    "preview": "{\n  \"zero_optimization\": {\n    \"stage\": 2,\n    \"allgather_partitions\": true,\n    \"allgather_bucket_size\": 5e8,\n    \"over"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/config_templates/template_zero3.json",
    "chars": 485,
    "preview": "{\n  \"zero_optimization\": {\n    \"stage\": 3,\n    \"allgather_partitions\": true,\n    \"allgather_bucket_size\": 5e8,\n    \"over"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/constants.py",
    "chars": 6695,
    "preview": "\"\"\"\nCopyright (c) Microsoft Corporation\nLicensed under the MIT license.\n\"\"\"\n\n#########################################\n#"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/scheduler.py",
    "chars": 15947,
    "preview": "import copy\n\nfrom numpy import BUFSIZE\nimport json\nimport subprocess\nimport sys\nimport threading\nimport time\nimport base"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/tuner/README.md",
    "chars": 619,
    "preview": "# Tuner\n\n\n`exps` is a list of experiment descriptions (dictionaries).\nAn experimentation description has a `ds_config` f"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/tuner/__init__.py",
    "chars": 139,
    "preview": "from .index_based_tuner import RandomTuner, GridSearchTuner\n# from .ga_tuner import GATuner\nfrom .model_based_tuner impo"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/tuner/base_tuner.py",
    "chars": 2607,
    "preview": "import sys\n\nfrom deepspeed.autotuning.constants import *\nfrom deepspeed.autotuning.utils import write_experiments\nfrom d"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/tuner/cost_model.py",
    "chars": 1723,
    "preview": "from .utils import *\n\ntry:\n    import xgboost as xgb\nexcept ImportError:\n    xgb = None\n\n\nclass XGBoostCostModel():\n    "
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/tuner/index_based_tuner.py",
    "chars": 1060,
    "preview": "import random\n\nfrom .base_tuner import BaseTuner\n\n\nclass RandomTuner(BaseTuner):\n    \"\"\"Explore the search space in rand"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/tuner/model_based_tuner.py",
    "chars": 5730,
    "preview": "import hjson\n\nfrom ..constants import AUTOTUNING, AUTOTUNING_METRIC_PATH\nfrom .base_tuner import BaseTuner\nfrom .cost_mo"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/tuner/utils.py",
    "chars": 2311,
    "preview": "import numpy as np\nimport itertools\nfrom ..utils import *\nimport collections.abc\n\n\ndef index_to_feature(p, dims):\n    \"\""
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/autotuning/utils.py",
    "chars": 15110,
    "preview": "import re\nimport collections.abc\nimport os\nimport json\nfrom deepspeed.runtime.constants import GRADIENT_ACCUMULATION_STE"
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/checkpoint/__init__.py",
    "chars": 505,
    "preview": "from .reshape_meg_2d import reshape_meg_2d_parallel\n\nfrom .deepspeed_checkpoint import DeepSpeedCheckpoint\n\nfrom .utils "
  },
  {
    "path": "benchmark/third_party/DeepSpeed/deepspeed/checkpoint/constants.py",
    "chars": 2244,
    "preview": "'''\n    Various symbolic constants used for model checkpointing\n'''\n\n#########################################\n# Optimiz"
  }
]

// ... and 3233 more files (download for full content)

About this extraction

This page contains the full source code of the FMInference/FlexLLMGen GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 3433 files (48.1 MB), approximately 12.8M tokens, and a symbol index with 42052 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo