Repository: facebookresearch/xformers Branch: main Commit: ce4f89d54286 Files: 905 Total size: 3.3 MB Directory structure: gitextract_wkps4m_l/ ├── .clang-format ├── .coveragerc ├── .editorconfig ├── .flake8 ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug-report.md │ │ ├── feature-request.md │ │ └── questions-help-support.md │ ├── PULL_REQUEST_TEMPLATE.md │ ├── actions/ │ │ ├── setup-build-cuda/ │ │ │ └── action.yml │ │ └── setup-env-build/ │ │ └── action.yml │ ├── compute_wheel_version.py │ ├── gpu_benchmark_diff.py │ ├── run-clang-format.py │ ├── run_benchmark_wrapper.py │ ├── selective_ci/ │ │ ├── requirements.txt │ │ └── selective_ci.py │ └── workflows/ │ ├── gh-pages.yml │ ├── gpu_test_gh.yml │ ├── linters.yml │ ├── linters_reusable.yml │ ├── rocm_build.yml │ ├── rocm_ci.yml │ ├── rocm_docker.yml │ ├── wheels.yml │ ├── wheels_build.yml │ ├── wheels_upload_pip.yml │ ├── wheels_upload_s3.yml │ └── win-build.yml ├── .gitignore ├── .gitmodules ├── .isort.cfg ├── .markdownlint.json ├── .pre-commit-config.yaml ├── .pyre_configuration ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs/ │ ├── Makefile │ ├── requirements.txt │ └── source/ │ ├── 2d_attention_patterns.ipynb │ ├── _static/ │ │ └── css/ │ │ └── customize.css │ ├── _templates/ │ │ ├── layout.html │ │ └── theme_variables.jinja │ ├── components/ │ │ ├── index.rst │ │ └── ops.rst │ ├── conf.py │ ├── index.rst │ ├── swin_transformer.ipynb │ └── what_is_xformers.rst ├── examples/ │ └── llama_inference/ │ ├── README.md │ ├── generate.py │ ├── model.py │ ├── mp_utils.py │ ├── requirements.txt │ ├── sample_utils.py │ ├── stats.py │ └── tokenizer.py ├── pyproject.toml ├── requirements-benchmark.txt ├── requirements-test.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── stubs/ │ ├── fvcore/ │ │ └── nn.pyi │ ├── matplotlib/ │ │ └── pyplot.pyi │ ├── numpy/ │ │ └── __init__.pyi │ ├── pandas.pyi │ ├── recommonmark/ │ │ └── transform.pyi │ ├── seaborn.pyi │ 
├── sklearn/ │ │ └── model_selection.pyi │ ├── submitit.pyi │ ├── tensorflow.pyi │ ├── torch/ │ │ ├── __init__.pyi │ │ ├── autograd/ │ │ │ ├── __init__.pyi │ │ │ └── profiler.pyi │ │ ├── cuda/ │ │ │ └── __init__.pyi │ │ ├── fft/ │ │ │ └── __init__.pyi │ │ ├── hub.pyi │ │ ├── linalg/ │ │ │ └── __init__.pyi │ │ ├── nn/ │ │ │ ├── __init__.pyi │ │ │ ├── functional/ │ │ │ │ └── __init__.pyi │ │ │ ├── functional.pyi │ │ │ ├── init.pyi │ │ │ └── utils/ │ │ │ └── __init__.pyi │ │ ├── onnx.pyi │ │ ├── ops.pyi │ │ ├── optim/ │ │ │ └── __init__.pyi │ │ ├── profiler/ │ │ │ └── __init__.pyi │ │ ├── random/ │ │ │ └── __init__.pyi │ │ ├── sparse/ │ │ │ └── __init__.pyi │ │ └── utils/ │ │ ├── data.pyi │ │ └── model_zoo.pyi │ ├── torch_stub_tests.py │ ├── tqdm.pyi │ └── triton/ │ ├── __init__.pyi │ ├── language.pyi │ └── ops/ │ └── blocksparse.pyi ├── tests/ │ ├── __init__.py │ ├── multiprocessing_utils.py │ ├── readme_test_on_rocm.txt │ ├── test_attention_patterns.py │ ├── test_checkpoint.py │ ├── test_fmha_flop_formula.py │ ├── test_fmha_merge_attentions.py │ ├── test_fwbw_overlap.py │ ├── test_indexing.py │ ├── test_mem_eff_attention.py │ ├── test_multiprocessing_utils.py │ ├── test_profiler.py │ ├── test_rmsnorm.py │ ├── test_rope_padded.py │ ├── test_seqpar.py │ ├── test_sequence_parallel_fused_ops.py │ ├── test_sparse_tensors.py │ ├── test_sparsity24.py │ ├── test_splitk_reference.py │ ├── test_tiled_matmul.py │ ├── test_tree_attention.py │ ├── test_triton_varargs.py │ ├── test_unbind.py │ └── utils.py ├── version.txt └── xformers/ ├── __init__.py ├── _cpp_lib.py ├── _deprecation_warning.py ├── attn_bias_utils.py ├── benchmarks/ │ ├── __init__.py │ ├── benchmark_attn_decoding.py │ ├── benchmark_indexing.py │ ├── benchmark_mem_eff_attention.py │ ├── benchmark_merge_attentions.py │ ├── benchmark_sequence_parallel_fused.py │ ├── benchmark_sp24.py │ ├── benchmark_tiled_matmul.py │ ├── readme_benchmark_on_rocm.txt │ └── utils.py ├── checkpoint.py ├── components/ │ └── attention/ │ 
└── attention_patterns.py ├── csrc/ │ ├── attention/ │ │ ├── attention.cpp │ │ ├── hip_decoder/ │ │ │ ├── CMakeLists.txt │ │ │ ├── attention_forward_splitk.cpp │ │ │ ├── ck_tile_attention_forward_decoder_splitk.h │ │ │ └── ck_tile_attention_inner_product.h │ │ └── hip_fmha/ │ │ ├── GENERATE_INSTANCES.md │ │ ├── attention_backward_generic_ck_tiled.cpp │ │ ├── attention_ck_rand_uniform.cpp │ │ ├── attention_forward_generic_ck_tiled.cpp │ │ ├── ck_fmha_test.cpp │ │ ├── ck_fmha_util.h │ │ ├── ck_tiled_bool_switch.h │ │ ├── ck_tiled_fmha_batched_backward.h │ │ ├── ck_tiled_fmha_batched_backward_bf16.cpp │ │ ├── ck_tiled_fmha_batched_backward_fp16.cpp │ │ ├── ck_tiled_fmha_batched_forward.h │ │ ├── ck_tiled_fmha_batched_forward_bf16.cpp │ │ ├── ck_tiled_fmha_batched_forward_dispatch.h │ │ ├── ck_tiled_fmha_batched_forward_fp16.cpp │ │ ├── ck_tiled_fmha_batched_forward_splitkv_dispatch.h │ │ ├── ck_tiled_fmha_batched_forward_splitkv_smallq_dispatch.h │ │ ├── ck_tiled_fmha_batched_infer.h │ │ ├── ck_tiled_fmha_batched_infer_bf16.cpp │ │ ├── ck_tiled_fmha_batched_infer_dispatch.h │ │ ├── ck_tiled_fmha_batched_infer_fp16.cpp │ │ ├── ck_tiled_fmha_batched_infer_splitkv_dispatch.h │ │ ├── ck_tiled_fmha_batched_infer_splitkv_smallq_dispatch.h │ │ ├── ck_tiled_fmha_bwd_setting.h │ │ ├── ck_tiled_fmha_fwd_setting.h │ │ ├── ck_tiled_fmha_fwd_splitkv_selector.h │ │ ├── ck_tiled_fmha_fwd_splitkv_setting.h │ │ ├── ck_tiled_fmha_fwd_splitkv_smallq_selector.h │ │ ├── ck_tiled_fmha_fwd_splitkv_smallq_setting.h │ │ ├── ck_tiled_fmha_fwd_type_config.h │ │ ├── ck_tiled_fmha_grouped_backward.h │ │ ├── ck_tiled_fmha_grouped_backward_bf16.cpp │ │ ├── ck_tiled_fmha_grouped_backward_fp16.cpp │ │ ├── ck_tiled_fmha_grouped_forward.h │ │ ├── ck_tiled_fmha_grouped_forward_bf16.cpp │ │ ├── ck_tiled_fmha_grouped_forward_dispatch.h │ │ ├── ck_tiled_fmha_grouped_forward_fp16.cpp │ │ ├── ck_tiled_fmha_grouped_forward_splitkv_dispatch.h │ │ ├── ck_tiled_fmha_grouped_forward_splitkv_smallq_dispatch.h │ │ 
├── ck_tiled_fmha_grouped_infer.h │ │ ├── ck_tiled_fmha_grouped_infer_bf16.cpp │ │ ├── ck_tiled_fmha_grouped_infer_dispatch.h │ │ ├── ck_tiled_fmha_grouped_infer_fp16.cpp │ │ ├── ck_tiled_fmha_grouped_infer_splitkv_dispatch.h │ │ ├── ck_tiled_fmha_grouped_infer_splitkv_smallq_dispatch.h │ │ ├── ck_tiled_fmha_num_kv_split_switch.h │ │ ├── ck_tiled_fmha_params.h │ │ ├── ck_tiled_fmha_seqlen_q_switch.h │ │ ├── ck_tiled_headdim_switch.h │ │ ├── ck_tiled_rand_uniform_kernel.h │ │ ├── generate_instances.py │ │ └── instances/ │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── 
fmha_batched_backward_bf16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_bf16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_bf16_instances_ref.h │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp │ │ ├── 
fmha_batched_backward_bf16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_bf16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── 
fmha_batched_backward_fp16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── 
fmha_batched_backward_fp16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_fp16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_fp16_instances_ref.h │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── 
fmha_batched_backward_fp16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_backward_fp16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_has_bias_no_dropout_maxk_256.cpp │ 
│ ├── fmha_batched_forward_bf16_has_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_bf16_has_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_bf16_instances_ref.h │ │ ├── fmha_batched_forward_bf16_no_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── 
fmha_batched_forward_bf16_no_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_bf16_no_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── 
fmha_batched_forward_fp16_has_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_fp16_has_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_fp16_instances_ref.h │ │ ├── fmha_batched_forward_fp16_no_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── 
fmha_batched_forward_fp16_no_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_forward_fp16_no_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── 
fmha_batched_infer_bf16_has_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_bf16_has_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_bf16_instances_ref.h │ │ ├── fmha_batched_infer_bf16_no_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── 
fmha_batched_infer_bf16_no_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_bf16_no_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── 
fmha_batched_infer_fp16_has_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_fp16_has_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_fp16_instances_ref.h │ │ ├── fmha_batched_infer_fp16_no_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_batched_infer_fp16_no_mask_no_bias_no_dropout_maxk_96.cpp │ 
│ ├── fmha_grouped_backward_bf16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── 
fmha_grouped_backward_bf16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_bf16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_bf16_instances_ref.h │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── 
fmha_grouped_backward_bf16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_bf16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_has_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp │ │ ├── 
fmha_grouped_backward_fp16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_has_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_has_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_no_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── 
fmha_grouped_backward_fp16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_fp16_has_mask_no_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_fp16_instances_ref.h │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_has_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_has_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_has_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── 
fmha_grouped_backward_fp16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_no_bias_no_biasgrad_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_backward_fp16_no_mask_no_bias_no_biasgrad_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_no_bias_has_dropout_maxk_32.cpp 
│ │ ├── fmha_grouped_forward_bf16_has_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_bf16_has_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_bf16_instances_ref.h │ │ ├── fmha_grouped_forward_bf16_no_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_no_bias_has_dropout_maxk_64.cpp │ │ 
├── fmha_grouped_forward_bf16_no_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_bf16_no_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── 
fmha_grouped_forward_fp16_has_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_fp16_has_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_fp16_instances_ref.h │ │ ├── fmha_grouped_forward_fp16_no_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── 
fmha_grouped_forward_fp16_no_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_forward_fp16_no_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_bf16_has_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── 
fmha_grouped_infer_bf16_instances_ref.h │ │ ├── fmha_grouped_infer_bf16_no_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_bf16_no_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── 
fmha_grouped_infer_fp16_has_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_no_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_fp16_has_mask_no_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_fp16_instances_ref.h │ │ ├── fmha_grouped_infer_fp16_no_mask_has_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_has_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_has_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_has_bias_has_dropout_maxk_512.cpp │ │ ├── 
fmha_grouped_infer_fp16_no_mask_has_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_has_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_has_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_has_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_has_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_has_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_has_bias_no_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_has_bias_no_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_no_bias_has_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_no_bias_has_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_no_bias_has_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_no_bias_has_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_no_bias_has_dropout_maxk_64.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_no_bias_has_dropout_maxk_96.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_no_bias_no_dropout_maxk_128.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_no_bias_no_dropout_maxk_256.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_no_bias_no_dropout_maxk_32.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_no_bias_no_dropout_maxk_512.cpp │ │ ├── fmha_grouped_infer_fp16_no_mask_no_bias_no_dropout_maxk_64.cpp │ │ └── fmha_grouped_infer_fp16_no_mask_no_bias_no_dropout_maxk_96.cpp │ ├── nvcc_info.cu │ ├── pt_stable_utils.cu │ ├── pt_stable_utils.h │ └── sparse24/ │ ├── compute_sparse_tile.h │ ├── gemm.cu │ ├── meta_utils.cu │ ├── sparse24.cpp │ ├── sparse24_apply.cu │ ├── sparse24_apply_dense_output.cu │ ├── sparse24_gemm_sm90.cu │ ├── sparse24_largest_mask_2d.cu │ ├── sparse24_metadata.h │ ├── sparse24_pack.cu │ ├── sparse24_pack.h │ ├── sparse24_pack_test.cu │ ├── sparseNM_dense.cu │ ├── static_sort.h │ └── warp_tensor.h ├── flash_attn_3/ │ └── __init__.py ├── fwbw_overlap.py ├── info.py ├── ops/ │ ├── __init__.py │ ├── _triton/ │ 
│ ├── __init__.py │ │ ├── k_index_select_cat.py │ │ ├── k_scaled_index_add.py │ │ ├── matmul_perf_model.py │ │ ├── rmsnorm_kernels.py │ │ ├── rope_padded_kernels.py │ │ └── tiled_matmul_kernels.py │ ├── common.py │ ├── differentiable_collectives.py │ ├── fmha/ │ │ ├── __init__.py │ │ ├── _triton/ │ │ │ ├── __init__.py │ │ │ └── splitk_kernels.py │ │ ├── attn_bias.py │ │ ├── ck.py │ │ ├── ck_splitk.py │ │ ├── common.py │ │ ├── cutlass.py │ │ ├── cutlass_blackwell.py │ │ ├── dispatch.py │ │ ├── flash.py │ │ ├── flash3.py │ │ ├── merge_training.py │ │ ├── torch_attention_compat.py │ │ └── triton_splitk.py │ ├── indexing.py │ ├── modpar_layers.py │ ├── rmsnorm.py │ ├── rope_padded.py │ ├── seqpar.py │ ├── sequence_parallel_fused_ops.py │ ├── sp24.py │ ├── swiglu_op.py │ ├── tiled_matmul.py │ ├── tree_attention.py │ └── unbind.py ├── profiler/ │ ├── __init__.py │ ├── api.py │ ├── device_limits.py │ ├── find_slowest.py │ ├── profile_analyzer.py │ ├── profiler.py │ ├── profiler_dcgm.py │ └── profiler_dcgm_impl.py ├── sparse/ │ ├── __init__.py │ ├── blocksparse_tensor.py │ └── utils.py ├── test.py ├── triton/ │ ├── __init__.py │ ├── importing.py │ └── vararg_kernel.py └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .clang-format ================================================ --- AccessModifierOffset: -1 AlignAfterOpenBracket: AlwaysBreak AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignEscapedNewlinesLeft: true AlignOperands: false AlignTrailingComments: false AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true AlwaysBreakTemplateDeclarations: true 
BinPackArguments: false BinPackParameters: false BraceWrapping: AfterClass: false AfterControlStatement: false AfterEnum: false AfterFunction: false AfterNamespace: false AfterObjCDeclaration: false AfterStruct: false AfterUnion: false BeforeCatch: false BeforeElse: false IndentBraces: false BreakBeforeBinaryOperators: None BreakBeforeBraces: Attach BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false BreakAfterJavaFieldAnnotations: false BreakStringLiterals: false ColumnLimit: 80 CommentPragmas: '^ IWYU pragma:' #CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DerivePointerAlignment: false DisableFormat: false ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ] IncludeCategories: - Regex: '^<.*\.h(pp)?>' Priority: 1 - Regex: '^<.*' Priority: 2 - Regex: '.*' Priority: 3 IndentCaseLabels: true IndentWidth: 2 IndentWrappedFunctionNames: false KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCBlockIndentWidth: 2 ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: false PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 2000000 PointerAlignment: Left ReflowComments: true SortIncludes: true SpaceAfterCStyleCast: false SpaceBeforeAssignmentOperators: true SpaceBeforeParens: ControlStatements SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Cpp11 TabWidth: 8 UseTab: Never ... 
================================================ FILE: .coveragerc ================================================ [run] omit = docs/* tests/* setup.py xformers/benchmarks/* xformers/triton/k_* stubs/* third_party/* ================================================ FILE: .editorconfig ================================================ root = true [*.py] charset = utf-8 trim_trailing_whitespace = true end_of_line = lf insert_final_newline = true indent_style = space indent_size = 4 [*.md] trim_trailing_whitespace = false ================================================ FILE: .flake8 ================================================ [flake8] exclude = .git ,.github/run-clang-format.py ,third_party max-line-length = 140 copyright-check = True select = E,F,W,C copyright-regexp=Copyright \(c\) Facebook, Inc. and its affiliates. All Rights Reserved ignore=W503,E203,E704 ================================================ FILE: .github/ISSUE_TEMPLATE/bug-report.md ================================================ --- name: "\U0001F41B Bug Report" about: Submit a bug report to help us improve xFormers --- # 🐛 Bug ## Command ## To Reproduce Steps to reproduce the behavior: 1. 2. 3. ## Expected behavior ## Environment Please copy and paste the output from the environment collection script from PyTorch (or fill out the checklist below manually). You can run the script with: ```bash # For security purposes, please check the contents of collect_env.py before running it. 
python -m torch.utils.collect_env ``` - PyTorch Version (e.g., 1.0): - OS (e.g., Linux): - How you installed PyTorch (`conda`, `pip`, source): - Build command you used (if compiling from source): - Python version: - CUDA/cuDNN version: - GPU models and configuration: - Any other relevant information: ## Additional context ================================================ FILE: .github/ISSUE_TEMPLATE/feature-request.md ================================================ --- name: "\U0001F680Feature Request" about: Submit a proposal/request for a new xFormers feature --- # 🚀 Feature ## Motivation ## Pitch ## Alternatives ## Additional context ================================================ FILE: .github/ISSUE_TEMPLATE/questions-help-support.md ================================================ --- name: "❓Questions/Help/Support" about: Do you need support? --- # ❓ Questions and Help ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ## What does this PR do? Fixes # (issue). ## Before submitting - [ ] Did you have fun? - Make sure you had fun coding 🙃 - [ ] Did you read the [contributor guideline](https://github.com/facebookresearch/xformers/blob/master/CONTRIBUTING.md)? - [ ] Was this discussed/approved via a Github issue? (no need for typos, doc improvements) - [ ] N/A - [ ] Did you make sure to update the docs? - [ ] N/A - [ ] Did you write any new necessary tests? - [ ] N/A - [ ] Did you update the [changelog](https://github.com/facebookresearch/xformers/blob/master/CHANGELOG.md)? (if needed) - [ ] N/A ## PR review Anyone in the community is free to review the PR once the tests have passed. If we didn't discuss your PR in Github issues there's a high chance it will not be merged. 
================================================ FILE: .github/actions/setup-build-cuda/action.yml ================================================ name: Set up Runner for build inputs: toolkit_type: description: cuda or rocm type: string toolkit_short_version: required: true type: string description: "Example: 117 for 11.7" python: description: Python version to install type: string default: "3.10" runs: using: composite steps: - id: cuda_info shell: python3 "{0}" run: | import os import sys print(sys.version) cushort = "${{ inputs.toolkit_short_version }}" # Version uploaded to pypi (rather than PyTorch s3) TORCH_CUDA_DEFAULT = "128" # since pytorch 2.9.0 # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts full_version, install_script = { "130": ("13.0.1", "https://developer.download.nvidia.com/compute/cuda/13.0.1/local_installers/cuda_13.0.1_580.82.07_linux.run"), "129": ("12.9.1", "https://developer.download.nvidia.com/compute/cuda/12.9.1/local_installers/cuda_12.9.1_575.57.08_linux.run"), "128": ("12.8.1", "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run"), # (Build with nvcc 12.8 on linux even when building for 12.6 to avoid seg fault in Flash3 build) "126": ("12.8.1", "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run"), "118": ("11.8.0", "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"), "6.0": ("6.0.2", "https://repo.radeon.com/amdgpu-install/6.0.2/rhel/8.9/amdgpu-install-6.0.60002-1.el8.noarch.rpm"), "6.1": ("6.1.3", "https://repo.radeon.com/amdgpu-install/6.1.3/rhel/8.9/amdgpu-install-6.1.60103-1.el8.noarch.rpm"), "6.2.4": ("6.2.4", "https://repo.radeon.com/amdgpu-install/6.2.4/rhel/8.9/amdgpu-install-6.2.60204-1.el8.noarch.rpm"), "6.3": ("6.3.1", "https://repo.radeon.com/amdgpu-install/6.3.1/rhel/8.9/amdgpu-install-6.3.60301-1.el8.noarch.rpm"), "6.4": 
("6.4.2", "https://repo.radeon.com/amdgpu-install/6.4.2/rhel/8.9/amdgpu-install-6.4.60402-1.el8.noarch.rpm"), "7.0": ("7.0.3", "https://repo.radeon.com/amdgpu-install/7.0.3/rhel/8/amdgpu-install-7.0.3.70003-1.el8.noarch.rpm"), "7.1": ("7.1.0", "https://repo.radeon.com/amdgpu-install/7.1/rhel/8/amdgpu-install-7.1.70100-1.el8.noarch.rpm"), }[cushort] with open(os.environ['GITHUB_OUTPUT'], "r+") as fp: fp.write("CUDA_VERSION=" + full_version + "\n") if cushort == TORCH_CUDA_DEFAULT: fp.write("CUDA_VERSION_SUFFIX=\n") else: fp.write("CUDA_VERSION_SUFFIX=+" + ("cu" if "cuda" == "${{ inputs.toolkit_type }}" else "rocm") + cushort + "\n") fp.write("CUDA_INSTALL_SCRIPT=" + install_script + "\n") - run: echo "CUDA_VERSION_SUFFIX=${{ steps.cuda_info.outputs.CUDA_VERSION_SUFFIX }}" >> ${GITHUB_ENV} shell: bash # WINDOWS STEPS - name: Install cuda if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda' id: cuda-toolkit # Using N-Storm fork until https://github.com/Jimver/cuda-toolkit/issues/395 is resolved uses: N-Storm/cuda-toolkit@v0.2.28 with: cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }} method: network - if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda' shell: bash run: | echo "Installed cuda version is: ${{ steps.cuda-toolkit.outputs.cuda }}" echo "Cuda install location: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}" echo "CUDA_HOME=${{ steps.cuda-toolkit.outputs.CUDA_PATH }}" >> ${GITHUB_ENV} cat ${GITHUB_ENV} - name: Install python if: runner.os == 'Windows' uses: actions/setup-python@v4 with: python-version: ${{ inputs.python }} - name: Setup MSVC if: runner.os == 'Windows' uses: ilammy/msvc-dev-cmd@v1 # really unfortunate: https://github.com/ilammy/msvc-dev-cmd#name-conflicts-with-shell-bash - name: Remove link.exe if: runner.os == 'Windows' shell: bash run: rm /usr/bin/link # LINUX STEPS - if: ${{ runner.os == 'Linux' && !(contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) > 124) }} shell: bash run: | # Use GCC11 for ROCM 
/ cu118 / cu124 yum list installed yum install gcc-toolset-11-gcc gcc-toolset-11-gcc-c++ gcc-toolset-11-libstdc++-devel wget git -y echo "source /opt/rh/gcc-toolset-11/enable" >> ~/.profile - if: ${{ runner.os == 'Linux' && contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) > 124 }} shell: bash run: | # Use GCC13 for cu126+ yum list installed yum install gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ gcc-toolset-13-libstdc++-devel wget git -y echo "source /opt/rh/gcc-toolset-13/enable" >> ~/.profile - if: runner.os == 'Linux' shell: bash -l {0} run: | yum list installed yum install wget git -y which g++ g++ --version - if: runner.os == 'Linux' && contains(inputs.toolkit_type, 'cuda') name: (Linux) install cuda shell: bash -l {0} run: | wget -q "${{ steps.cuda_info.outputs.CUDA_INSTALL_SCRIPT }}" -O cuda.run && \ sh ./cuda.run --silent --toolkit && \ rm ./cuda.run echo "CUDA_HOME=/usr/local/cuda" >> ${GITHUB_ENV} - if: runner.os == 'Linux' && contains(inputs.toolkit_type, 'cuda') name: (Linux) print cuda setup info shell: bash -l {0} run: | echo "CUDA_HOME=$CUDA_HOME" echo "###############################" echo "############ NVCC ############" echo "###############################" $CUDA_HOME/bin/nvcc --version md5sum $CUDA_HOME/bin/nvcc echo "###############################" echo "############ PTXAS ############" echo "###############################" $CUDA_HOME/bin/ptxas --version md5sum $CUDA_HOME/bin/ptxas - if: runner.os == 'Linux' && contains(inputs.toolkit_type, 'rocm') name: (Linux) install rocm shell: bash run: | yum install -y libzstd yum install -y ${{ steps.cuda_info.outputs.CUDA_INSTALL_SCRIPT }} amdgpu-install -y --usecase=rocm --no-dkms echo "ROCM_PATH=/opt/rocm" >> ${GITHUB_ENV} echo "PATH=$PATH:/opt/rocm/bin" >> ${GITHUB_ENV} echo "MAX_JOBS=16" >> ${GITHUB_ENV} # host compiler is too new for cuda 12.1 :( - run: echo "NVCC_FLAGS=-allow-unsupported-compiler" >> $GITHUB_ENV shell: bash 
================================================ FILE: .github/actions/setup-env-build/action.yml ================================================ name: Install env + build inputs: arch: description: 'GPU architecture' required: true python: description: 'Python version' required: false default: "3.11" runs: using: composite steps: - name: Cleanup shell: bash run: rm -f ~/.profile ~/.bashrc - id: prepare_conda_env_paths shell: python run: | import os import subprocess import hashlib import glob import datetime from pathlib import Path CONDA_INSTALL_CMD = "micromamba create python=${{ inputs.python }} zlib pip ninja ccache=4.8 -c conda-forge -q -y" conda_env_key = CONDA_INSTALL_CMD + "[cu130][v2]" for file in sorted(glob.glob("requirement*.txt")): conda_env_key += f"\n########## {file}\n" conda_env_key += Path(file).read_text() env_name_key = hashlib.sha224(conda_env_key.encode("ascii")).hexdigest()[:8] env_name_key += "-${{ inputs.arch }}" # Nightly or Test, update every week env_name_key += "-"+datetime.date.today().strftime("%Y-week%W") shared_dir = os.environ.get("GHRUNNER_SHARED_DIR", os.getcwd()) env_path = os.path.join(shared_dir, "tmp", "${{ inputs.arch }}", os.environ["GITHUB_RUN_ID"]) final_env = Path(shared_dir) / f"env_{env_name_key}.txt" pkg_dir = Path(shared_dir) / "pkgs-sm${{ inputs.arch }}" (Path(shared_dir) / f"env_{env_name_key}_content.txt").write_text(conda_env_key) CONDA_INSTALL_CMD += " -p " + env_path env_already_built = False # If environment is already built if final_env.is_file(): final_env_link = final_env.read_text().strip() if (Path(final_env_link) / "bin" / "python").is_file(): print("Found valid env - skipping env setup") CONDA_INSTALL_CMD = "true" env_already_built = True env_path = final_env_link else: print("Invalid env") with open(os.environ['GITHUB_ENV'], "r+") as fp: fp.write("CONDA_ENV_LINK=" + str(final_env) + "\n") fp.write("CONDA_PREFIX=" + env_path + "\n") fp.write("CONDA_PKGS_DIRS=" + str(pkg_dir) + "\n") 
fp.write("CONDA_INSTALL_CMD=" + CONDA_INSTALL_CMD + "\n") fp.write("CONDA_ENV_HASH=" + env_name_key + "\n") fp.write("PY=" + os.path.join(env_path, "bin", "python") + "\n") fp.write("PIP=" + os.path.join(env_path, "bin", "pip") + "\n") with open(os.environ['GITHUB_OUTPUT'], "r+") as fp: fp.write(f"ENV_CACHED={int(env_already_built)}\n") - name: Print conda commands shell: bash -l {0} run: | echo "CONDA_PREFIX=$CONDA_PREFIX" echo "CONDA_INSTALL_CMD=$CONDA_INSTALL_CMD" echo "CONDA_ENV_HASH=$CONDA_ENV_HASH" echo "PY=$PY" - name: Install micromamba shell: bash -l {0} run: | set -ex curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest -o micromamba.tar.bz2 tar --extract --verbose --bzip2 --file=micromamba.tar.bz2 bin/micromamba echo "eval \"\$($(pwd)/bin/micromamba shell hook --shell bash)\"" >> ~/.profile - name: Conda/pip setup shell: bash -l {0} if: steps.prepare_conda_env_paths.outputs.ENV_CACHED == 0 run: | set -ex micromamba config set channel_priority strict # Retry if failed after removing downloaded packages cache $CONDA_INSTALL_CMD || (rm -rf $CONDA_PKGS_DIRS && rm -rf $CONDA_PREFIX && $CONDA_INSTALL_CMD) $PY -m pip install cmake $PY -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128 $PY -m pip install -r requirements-benchmark.txt --progress-bar off - name: Activate environment shell: bash -l {0} run: | echo "micromamba activate $CONDA_PREFIX" >> ~/.profile echo "==== .profile =====" cat ~/.profile - run: which python shell: bash -l {0} - name: Setup ccache nvcc shell: bash -l {0} if: steps.prepare_conda_env_paths.outputs.ENV_CACHED == 0 run: | echo "#!/bin/bash" > $CONDA_PREFIX/bin/nvcc-ccache echo "ccache nvcc \"\$@\"" >> $CONDA_PREFIX/bin/nvcc-ccache cat $CONDA_PREFIX/bin/nvcc-ccache chmod +x $CONDA_PREFIX/bin/nvcc-ccache which nvcc ccache --version - name: Setup ccache g++ shell: bash -l {0} if: steps.prepare_conda_env_paths.outputs.ENV_CACHED == 0 run: | echo "#!/bin/bash" > 
$CONDA_PREFIX/bin/g++-ccache echo "ccache g++ \"\$@\"" >> $CONDA_PREFIX/bin/g++-ccache cat $CONDA_PREFIX/bin/g++-ccache chmod +x $CONDA_PREFIX/bin/g++-ccache which g++-ccache - name: Patch for https://github.com/pytorch/pytorch/issues/114962 shell: bash -l {0} run: | CPP_EXTENSIONS_PY=$(python -c "import torch.utils.cpp_extension; print(torch.utils.cpp_extension.__file__)") echo "Patching $CPP_EXTENSIONS_PY" sed -i "/generate-dependencies-with-compile/d" $CPP_EXTENSIONS_PY - name: Check NVIDIA libs shell: bash -l {0} run: | ldconfig -p | grep libcuda.so ls /.singularity.d/libs/ - name: Mark env as ready shell: bash -l {0} if: steps.prepare_conda_env_paths.outputs.ENV_CACHED == 0 run: echo $CONDA_PREFIX > $CONDA_ENV_LINK - name: Setup ccache shell: bash -l {0} run: | export CCACHE_DIR=$GHRUNNER_SHARED_DIR/ccache echo "CCACHE_DIR=$CCACHE_DIR" >> ${GITHUB_ENV} mkdir -p $CCACHE_DIR ccache -s - name: Build shell: bash -l {0} run: | PYTORCH_NVCC="$CONDA_PREFIX/bin/nvcc-ccache" CXX="g++-ccache" TORCH_CUDA_ARCH_LIST=${{ inputs.arch }} python -m pip install -v --no-build-isolation -e . - name: Check for PyTorch stable symbols shell: bash -l {0} run: | bad_symbols=$(nm --dynamic --undefined-only --demangle xformers/_C.so | grep --extended-regexp "(torch|at|c10|c10d)::" || true) if [[ $bad_symbols != "" ]]; then echo "These non-stable PyTorch symbols made it into the xFormers shared library:"; echo $bad_symbols; exit 1; fi - name: Build info run: | printenv python -m xformers.info python xformers/_triton_version_fairinternal.py ccache -s shell: bash -l {0} ================================================ FILE: .github/compute_wheel_version.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. # # This source code is licensed under the BSD license found in the # LICENSE file in the root directory of this source tree. 
import argparse
import subprocess
from pathlib import Path
from typing import Optional

# TODO: consolidate with the code in build_conda.py
THIS_PATH = Path(__file__).resolve()
# Base version string, read from `version.txt` located in the parent of the
# directory that contains this script.
version_from_file = (THIS_PATH.parents[1] / "version.txt").read_text().strip()


def get_tagged_version() -> Optional[str]:
    """
    Return the version named by the git tag that points exactly at HEAD.

    Returns:
        The tag name without its leading ``v`` (``v1.2.3`` gives ``1.2.3``),
        or ``None`` when ``git describe --tags --exact-match HEAD`` fails
        (no tag on HEAD) or the tag does not start with ``v``.
    """
    try:
        tag = subprocess.check_output(
            ["git", "describe", "--tags", "--exact-match", "HEAD"],
            text=True,
            stderr=subprocess.DEVNULL,  # keep git's "no tag" message out of the logs
        ).strip()
    except subprocess.CalledProcessError:  # no tag
        return None

    if not tag.startswith("v"):
        return None
    return tag[1:]


def get_dev_version() -> str:
    """
    Return a development version: ``<version.txt>.dev<number of commits on HEAD>``.

    Raises:
        ValueError: if ``version.txt`` already contains a ``.dev`` marker,
            which would otherwise yield a doubled ``.dev`` suffix.
    """
    # Explicit check rather than `assert`, which is removed under `python -O`.
    if ".dev" in version_from_file:
        raise ValueError(
            f"version.txt must not contain '.dev' (got {version_from_file!r})"
        )
    num_commits = subprocess.check_output(
        ["git", "rev-list", "--count", "HEAD"], text=True
    ).strip()
    return f"{version_from_file}.dev{num_commits}"


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--source", choices=["tag", "dev", "tag,dev"], required=False, default="tag,dev"
    )
    args = parser.parse_args()

    # "tag" and "tag,dev" both try the tag first; only "tag" makes it mandatory.
    if "tag" in args.source:
        tagged_version = get_tagged_version()
        if args.source == "tag" and tagged_version is None:
            raise ValueError("No tag found")
    else:
        tagged_version = None

    # No trailing newline: the output is captured verbatim by the caller.
    if tagged_version is not None:
        print(tagged_version, end="")
    else:
        print(get_dev_version(), end="")
import glob
import os
import subprocess

import xformers.benchmarks.utils as utils


class NamedObject:
    """Bare object that only carries a ``__name__`` attribute."""

    def __init__(self, name) -> None:
        self.__name__ = name


def git_file_at(filename: str, ref: str) -> str:
    """
    Return the stripped output of ``git show <ref>:<filename>``.

    An empty string is returned when the git command exits with an error.
    """
    blob_spec = f"{ref}:{filename}"
    try:
        content = subprocess.check_output(["git", "show", blob_spec], text=True)
    except subprocess.CalledProcessError:
        # File does not exist in that revision
        return ""
    return content.strip()


# Commit hashes of the PR base branch and of the current checkout.
_base_branch = "origin/" + os.environ["GITHUB_BASE_REF"]
GITHUB_BASE_REF = subprocess.check_output(
    ["git", "rev-parse", _base_branch], text=True
).strip()
XFORMERS_BENCHMARKS_CACHE = os.environ["XFORMERS_BENCHMARKS_CACHE"]
GITHUB_CURRENT_REF = subprocess.check_output(
    ["git", "rev-parse", "HEAD"], text=True
).strip()

_CSV_SUFFIX_LEN = len(".csv")
_BANNER = "#" * 100

for csv_path in glob.glob(os.path.join(XFORMERS_BENCHMARKS_CACHE, "*", "*.csv")):
    before = git_file_at(csv_path, ref=GITHUB_BASE_REF)
    now = git_file_at(csv_path, ref=GITHUB_CURRENT_REF)
    # Nothing to compare when the file is absent at the base or unchanged.
    if not before or now == before:
        continue

    # The benchmark name is the directory that holds the csv file.
    benchmark_name = os.path.basename(os.path.dirname(csv_path))
    print(_BANNER)
    print(f"# UPDATED: {csv_path}")
    print(_BANNER)

    # Write both revisions next to the reference file, then compare them.
    snapshots = [
        (csv_path.replace("reference", "before"), before),
        (csv_path.replace("reference", "now"), now),
    ]
    for out_path, text in snapshots:
        with open(out_path, "w+") as fd:
            fd.write(text)

    utils.benchmark_run_and_compare(
        benchmark_fn=NamedObject(benchmark_name),
        cases=[],
        compare=[
            os.path.basename(out_path)[:-_CSV_SUFFIX_LEN] for out_path, _ in snapshots
        ],
    )
def list_files(files, recursive=False, extensions=None, exclude=None):
    """
    Expand the given paths into the list of files to process.

    Args:
        files: paths given by the caller.
        recursive: when true, every directory in ``files`` is walked and
            replaced by the files found below it.
        extensions: extensions (without the dot) kept during a recursive walk.
        exclude: ``fnmatch`` patterns; matching directories are not descended
            into and matching files are dropped during a recursive walk.

    Returns:
        A list of paths. Entries that are not walked (non-directories, or any
        entry when ``recursive`` is false) are passed through unchanged.
    """
    wanted_exts = [] if extensions is None else extensions
    skip_patterns = [] if exclude is None else exclude

    def _excluded(path):
        return any(fnmatch.fnmatch(path, pat) for pat in skip_patterns)

    collected = []
    for entry in files:
        if not (recursive and os.path.isdir(entry)):
            collected.append(entry)
            continue

        for root, subdirs, names in os.walk(entry):
            # Pruning the directory list in place stops os.walk() from
            # descending into excluded directories at all.
            subdirs[:] = [
                d for d in subdirs if not _excluded(os.path.join(root, d))
            ]
            for name in names:
                candidate = os.path.join(root, name)
                if _excluded(candidate):
                    continue
                suffix = os.path.splitext(candidate)[1][1:]
                if suffix in wanted_exts:
                    collected.append(candidate)
    return collected
def main():
    """
    Command-line entry point: run clang-format over the requested files.

    Parses the command line, checks that the clang-format executable can be
    started, formats every selected file (in a process pool when more than
    one job is used) and prints the resulting diffs.

    Returns:
        ``ExitStatus.TROUBLE`` if clang-format could not be run or failed on
        any file, ``ExitStatus.DIFF`` if at least one file differs from its
        formatted version, ``ExitStatus.SUCCESS`` otherwise (including when
        no file was selected).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--clang-format-executable",
        metavar="EXECUTABLE",
        help="path to the clang-format executable",
        default="clang-format",
    )
    parser.add_argument(
        "--extensions",
        help="comma separated list of file extensions (default: {})".format(
            DEFAULT_EXTENSIONS
        ),
        default=DEFAULT_EXTENSIONS,
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help="run recursively over directories",
    )
    parser.add_argument("files", metavar="file", nargs="+")
    parser.add_argument("-q", "--quiet", action="store_true")
    parser.add_argument(
        "-j",
        metavar="N",
        type=int,
        default=0,
        help="run N clang-format jobs in parallel" " (default number of cpus + 1)",
    )
    parser.add_argument(
        "--color",
        default="auto",
        choices=["auto", "always", "never"],
        help="show colored diff (default: auto)",
    )
    parser.add_argument(
        "-e",
        "--exclude",
        metavar="PATTERN",
        action="append",
        default=[],
        help="exclude paths matching the given glob-like pattern(s)"
        " from recursive search",
    )

    args = parser.parse_args()

    # use default signal handling, like diff return SIGINT value on ^C
    # https://bugs.python.org/issue14229#msg156446
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    try:
        signal.SIGPIPE
    except AttributeError:
        # compatibility, SIGPIPE does not exist on Windows
        pass
    else:
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    colored_stdout = False
    colored_stderr = False
    if args.color == "always":
        colored_stdout = True
        colored_stderr = True
    elif args.color == "auto":
        colored_stdout = sys.stdout.isatty()
        colored_stderr = sys.stderr.isatty()

    # Fail early, with a readable message, if clang-format cannot be started.
    version_invocation = [args.clang_format_executable, str("--version")]
    try:
        subprocess.check_call(version_invocation, stdout=DEVNULL)
    except subprocess.CalledProcessError as e:
        print_trouble(parser.prog, str(e), use_colors=colored_stderr)
        return ExitStatus.TROUBLE
    except OSError as e:
        print_trouble(
            parser.prog,
            "Command '{}' failed to start: {}".format(
                subprocess.list2cmdline(version_invocation), e
            ),
            use_colors=colored_stderr,
        )
        return ExitStatus.TROUBLE

    retcode = ExitStatus.SUCCESS
    files = list_files(
        args.files,
        recursive=args.recursive,
        exclude=args.exclude,
        extensions=args.extensions.split(","),
    )

    if not files:
        # Nothing to check; same process exit code (0) as the former bare return.
        return ExitStatus.SUCCESS

    njobs = args.j
    if njobs == 0:
        njobs = multiprocessing.cpu_count() + 1
    njobs = min(len(files), njobs)

    run_one = partial(run_clang_format_diff_wrapper, args)
    if njobs == 1:
        # execute directly instead of in a pool,
        # less overhead, simpler stacktraces.
        # map() is used rather than a generator expression: a generator is
        # finished by the first exception raised inside it, which would
        # silently skip every file after the first DiffError.
        pool = None
        it = map(run_one, files)
    else:
        pool = multiprocessing.Pool(njobs)
        it = pool.imap_unordered(run_one, files)

    try:
        while True:
            try:
                outs, errs = next(it)
            except StopIteration:
                break
            except DiffError as e:
                # clang-format failed on this file: report and keep going.
                print_trouble(parser.prog, str(e), use_colors=colored_stderr)
                retcode = ExitStatus.TROUBLE
                sys.stderr.writelines(e.errs)
            except UnexpectedError as e:
                print_trouble(parser.prog, str(e), use_colors=colored_stderr)
                sys.stderr.write(e.formatted_traceback)
                retcode = ExitStatus.TROUBLE
                # stop at the first unexpected error,
                # something could be very wrong,
                # don't process all files unnecessarily
                # (the pool is terminated in the `finally` clause below)
                break
            else:
                sys.stderr.writelines(errs)
                if outs == []:
                    continue
                if not args.quiet:
                    print_diff(outs, use_color=colored_stdout)
                # TROUBLE takes precedence over DIFF.
                if retcode == ExitStatus.SUCCESS:
                    retcode = ExitStatus.DIFF
    finally:
        # Always release the worker processes, on success as well as on error.
        if pool is not None:
            pool.terminate()
            pool.join()
    return retcode
import glob
import os
import shlex
import subprocess
import sys

import torch

import xformers

# Build failed - return early
if not xformers._has_cpp_library:
    print("xFormers wasn't built correctly - can't run benchmarks")
    sys.exit(0)

# argv[1] is the benchmark script name, argv[2] the benchmark function.
benchmark_script = os.path.join("xformers", "benchmarks", sys.argv[1])
benchmark_fn = sys.argv[2]

# The first 8 characters of the HEAD commit hash label this run's results.
head_sha = subprocess.check_output(["git", "rev-parse", "HEAD"], text=True)
label = head_sha.strip()[:8]

cmd = [sys.executable, benchmark_script]
cmd.extend(["--label", label])
cmd.extend(["--fn", benchmark_fn])
cmd.extend(["--fail_if_regression", "--quiet"])

# Device name with spaces, dashes and dots replaced by underscores.
env = torch.cuda.get_device_name(torch.cuda.current_device())
for separator in (" ", "-", "."):
    env = env.replace(separator, "_")

# Figure out the name of the baseline
pattern = os.path.join(os.environ["XFORMERS_BENCHMARKS_CACHE"], benchmark_fn, "*.csv")
ref_names = glob.glob(pattern)
baseline_names = {
    os.path.basename(ref)[: -len(".csv")]
    for ref in ref_names
    # Only compare to benchmark data on same hardware
    if env in os.path.basename(ref)
}
if len(baseline_names) > 1:
    raise RuntimeError(
        f"Supplied more than one reference for this benchmark: {','.join(baseline_names)}"
    )
if baseline_names:
    cmd.extend(["--compare", ",".join(baseline_names)])

print("EXEC:", shlex.join(cmd))

# Keep the benchmark's exit status; the files are rotated even if it failed.
retcode = subprocess.call(cmd)

# Remove original benchmark files
for stale in ref_names:
    os.remove(stale)
# Rename new ones as 'ref'
for fresh in glob.glob(pattern):
    os.rename(fresh, fresh.replace(label, "reference"))

sys.exit(retcode)
import argparse
import fnmatch
import os
from dataclasses import dataclass, field
from pathlib import Path

import git


@dataclass
class ComponentInfo:
    """
    A component is deemed to have changed if any of its files or
    dependencies have changed. If it has not changed, its files will be removed.
    """

    name: str
    # These files will be deleted if the component is not enabled
    files: list[str]
    # Patterns that trigger testing of the component but are never deleted
    dependencies: list[str]
    # Environment variables exported when the component is skipped
    disable_set_env: dict[str, str] = field(default_factory=dict)


COMMON_PATTERNS = [
    # All components will be tested if something in there changes
    "setup.py",
]

COMPONENTS = [
    ComponentInfo(
        name="attention",
        files=[
            "tests/test_mem_eff_attention.py",
            "tests/test_find_sparse_locations*.py",
            "tests/test_block_sparse_mem_eff_attention*.py",
            "tests/test_attention_patterns.py",
            "tests/test_rope_padded.py",
            "tests/test_tree_attention*.py",
            "tests/test_fmha*.py",
        ],
        dependencies=[
            "xformers/ops/fmha/*",
            "third_party/cutlass",
            "third_party/composable_kernel_tiled",
            "xformers/csrc/attention/*",
            "xformers/triton/*",
        ],
        disable_set_env={
            "XFORMERS_DISABLE_FLASH_ATTN": "1",
        },
    ),
    ComponentInfo(
        name="sp24",
        files=[
            "tests/test_sparsity24.py",
            "xformers/csrc/sparse24/*",
        ],
        dependencies=[
            "xformers/ops/sp24.py",
        ],
    ),
    ComponentInfo(
        name="sequence_parallel_fused",
        files=[
            "tests/test_seqpar.py",
            "tests/test_sequence_parallel_fused_ops.py",
            "tests/test_tiled_matmul.py",
        ],
        dependencies=[
            "tests/multiprocessing_utils.py",
            "xformers/ops/sequence_parallel_fused_ops.py",
        ],
    ),
]

# Repository root: three directory levels above this script.
repo_root_path = Path(__file__).parent.parent.parent.resolve().absolute()
repo = git.Repo(repo_root_path)


def list_files_in_commit(commit: git.Commit):
    """
    Return the paths of all files (blobs) in ``commit``, relative to the repo root.

    The tree is walked iteratively with an explicit stack; directories
    themselves are not part of the result.
    """
    file_list = []
    stack = [commit.tree]
    while stack:
        tree = stack.pop()
        # enumerate blobs (files) at this level
        for b in tree.blobs:
            # NOTE(review): `.absolute()` resolves against the current working
            # directory, so this appears to assume the script is run from the
            # repository root - confirm against the CI invocation.
            file_list.append(str(Path(b.path).absolute().relative_to(repo_root_path)))
        stack.extend(tree.trees)
    return file_list


def check_patterns_are_valid(patterns):
    """
    Check that every pattern matches at least one entry of ``all_files``.

    The check only runs when ``GITHUB_REPOSITORY`` is ``fairinternal/xformers``.

    Raises:
        AssertionError: for the first pattern (in list order) that matches
            no file.
    """
    # Only check patterns in `fairinternal` repo
    if os.environ.get("GITHUB_REPOSITORY", "") != "fairinternal/xformers":
        return
    for pattern in patterns:
        if not any(fnmatch.fnmatch(f, pattern) for f in all_files):
            # Raised explicitly: an `assert` statement is removed under `python -O`.
            raise AssertionError(f"Pattern does not match any file: `{pattern}`")


parser = argparse.ArgumentParser("xFormers selective CI")
parser.add_argument("--base_commit", default="origin/main")
args = parser.parse_args()

base_commit = repo.rev_parse(args.base_commit)
all_files = list_files_in_commit(repo.head.commit) + [sm.path for sm in repo.submodules]

# Every path touched by the diff between HEAD and the base commit
# (both sides are recorded, so renames contribute two paths).
all_modified_files = set()
for item in repo.head.commit.diff(base_commit):
    if item.a_path is not None:
        all_modified_files.add(item.a_path)
    if item.b_path is not None:
        all_modified_files.add(item.b_path)

check_patterns_are_valid(COMMON_PATTERNS)
for component in COMPONENTS:
    # Sanity check that all files exist
    check_patterns_are_valid(component.files + component.dependencies)

    # Check if module is updated: one matching modified file is enough.
    trigger_patterns = COMMON_PATTERNS + component.files + component.dependencies
    skip_module = not any(
        fnmatch.fnmatch(f, pattern)
        for pattern in trigger_patterns
        for f in all_modified_files
    )
    print(component.name, "SKIP" if skip_module else "TEST")
    if not skip_module:
        continue

    # Delete component files
    for f in all_files:
        for pattern in component.files:
            if fnmatch.fnmatch(f, pattern):
                # A file may match several patterns; it can only be removed once.
                if Path(f).exists():
                    Path(f).unlink()
    # Set env variable
    for env_k, env_v in component.disable_set_env.items():
        if "GITHUB_ENV" not in os.environ:
            # Outside GitHub Actions: just show what would have been exported.
            print(f"{env_k}={env_v}")
            continue
        with open(os.environ["GITHUB_ENV"], "a") as fd:
            fd.write(f"{env_k}={env_v}\n")
${{ github.workflow }}-${{ github.ref }} steps: - uses: actions/checkout@v2 - name: Setup Python uses: actions/setup-python@v2 with: python-version: '3.9' - name: Upgrade pip run: | # install pip=>20.1 to use "pip cache dir" python3 -m pip install --upgrade pip - name: Get pip cache dir id: pip-cache run: echo "::set-output name=dir::$(pip cache dir)" - name: Cache dependencies uses: actions/cache@v4 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} restore-keys: | ${{ runner.os }}-pip- - name: Build docs run: | cd docs pip install --progress-bar off -r requirements.txt make help make html - name: Deploy uses: peaceiris/actions-gh-pages@v3 with: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: docs/build/html if: github.event_name != 'pull_request' ================================================ FILE: .github/workflows/gpu_test_gh.yml ================================================ name: gpu_test_gh on: workflow_dispatch: {} pull_request: paths: - "xformers/**" - "!xformers/benchmarks/**" - "!xformers/version.txt" - ".github/workflows/gpu_test_gh*" - "tests/**" - "setup.py" - "requirements*.txt" - "third_party/**" push: branches: - main env: XFORMERS_BUILD_TYPE: "Release" CI: "1" TORCHINDUCTOR_COMPILE_THREADS: "1" jobs: gpu_test_gh: strategy: fail-fast: false matrix: gpu: - runner: "h100-256GB" sm: "9.0a" - runner: "4-core-ubuntu-gpu-t4" sm: "7.5" python: [3.11] name: test_sm${{ matrix.gpu.sm }} runs-on: ${{ matrix.gpu.runner }} timeout-minutes: 360 defaults: run: shell: bash -l {0} steps: - name: Recursive checkout uses: actions/checkout@v3 with: submodules: recursive path: "." 
fetch-depth: 0 # We need commits history as well - run: nvidia-smi - name: Install micromamba run: | set -ex curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba echo "eval \"\$($(pwd)/bin/micromamba shell hook --shell bash)\"" >> ~/.profile cat ~/.profile - name: Create environment run: | set -ex micromamba config set channel_priority strict micromamba create -n env python=${{ matrix.python }} \ zlib pip ninja ccache=4.8 cuda-toolkit \ -c "nvidia/label/cuda-12.6" -c conda-forge -q -y - name: Activate environment shell: bash -l {0} run: | echo "micromamba activate env" >> ~/.profile echo "==== .profile =====" cat ~/.profile - name: Selective build/tests if: github.event_name == 'pull_request' run: | pip install -r .github/selective_ci/requirements.txt python .github/selective_ci/selective_ci.py --base_commit ${{ github.event.pull_request.base.sha }} - name: Setup test requirements run: | which python which nvcc pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126 pip install --pre flash_attn_3 --index-url https://download.pytorch.org/whl/cu126 pip install -r requirements-test.txt --progress-bar off - run: TORCH_CUDA_ARCH_LIST=${{ matrix.gpu.sm }} python -m pip install -v --no-build-isolation -e . 
env: TORCH_DONT_CHECK_COMPILER_ABI: 1 - run: python -m xformers.info - name: xFormers import should not init cuda context run: | # NOTE: we check GPU version by default to determine if triton should be used # and this initializes CUDA context, unless we set `XFORMERS_ENABLE_TRITON` XFORMERS_ENABLE_TRITON=1 python -c "import xformers; import xformers.ops; import torch; assert not torch.cuda.is_initialized()" - name: Check for PyTorch stable symbols run: | bad_symbols=$(nm --dynamic --undefined-only --demangle xformers/_C.so | grep --extended-regexp "(torch|at|c10|c10d)::" || true) if [[ $bad_symbols != "" ]]; then echo "These non-stable PyTorch symbols made it into the xFormers shared library:"; echo $bad_symbols; exit 1; fi - name: Unit tests run: | python -m pytest --verbose --random-order-bucket=global --maxfail=20 --junitxml=test-results/junit.xml --cov-report=xml --cov=./ tests - name: Publish Test Report uses: mikepenz/action-junit-report@v3 if: success() || failure() # always run even if the previous step fails with: report_paths: 'test-results/*.xml' ================================================ FILE: .github/workflows/linters.yml ================================================ on: pull_request: {} push: branches: - main jobs: repo: uses: ./.github/workflows/linters_reusable.yml ================================================ FILE: .github/workflows/linters_reusable.yml ================================================ name: lint on: workflow_call: inputs: pre-script: type: string jobs: linters: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 with: fetch-depth: 0 - name: Setup Python uses: actions/setup-python@v5 with: python-version: '3.10' - name: Cleanup host run: | # Github's ubuntu-latest comes with a ton of stuff; # https://carlosbecker.com/posts/github-actions-disk-space suggests # this hotfix: df -h sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL sudo docker image prune --all --force sudo docker 
builder prune -a df -h - name: Run pre-script if: ${{ inputs.pre-script }} run: ${{ inputs.pre-script }} # Triton is too slow to install, and beside it's not needed - run: sed -i '/triton/d' requirements-test.txt - name: Install deps run: pip install -r requirements-test.txt - name: ufmt if: success() || failure() run: ufmt check - name: mypy if: success() || failure() run: | python -m mypy --version python -m mypy --ignore-missing-imports --scripts-are-modules --pretty --exclude "(build|stubs|third_party|docs|examples|setup.py)" . - name: flake8 if: success() || failure() run: python -m flake8 --config .flake8 --show-source --statistics - name: clang-format if: success() || failure() run: | pip install clang-format clang-format --version # apply to our files - excluding autogenerated files ./.github/run-clang-format.py -e "*fmha/autogen" -r xformers/csrc - name: PyTorch stable API includes if: success() || failure() run: | bad_files=$(git grep --extended-regex -e "#\s*include\s*<.*(torch|ATen|c10)" --and --not -e "#\s*include\s*= 120 && fromJSON(inputs.toolkit_short_version) < 130 run: | echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a" >> ${GITHUB_ENV} - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 130 run: | echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a 10.0a 10.3a 11.0a 12.0a 12.1a" >> ${GITHUB_ENV} - if: runner.os == 'Windows' run: git config --system core.longpaths true - name: Recursive checkout uses: actions/checkout@v4 with: submodules: recursive path: "." 
fetch-depth: 0 # for tags - name: HACKFIX for cutlass compiler bug if: runner.os == 'Windows' run: | # See https://github.com/NVIDIA/cutlass/issues/1732 rm -f third_party/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp touch third_party/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp - name: Setup Runner uses: ./.github/actions/setup-build-cuda with: toolkit_type: ${{ inputs.toolkit_type }} toolkit_short_version: ${{ inputs.toolkit_short_version }} python: ${{ inputs.python }} - if: runner.os == 'Linux' run: printenv - if: runner.os != 'Windows' name: (Linux) Setup venv for linux shell: bash -l {0} run: | $PY -m venv venv . ./venv/bin/activate which pip echo "PY=$(which python)" >> ${GITHUB_ENV} echo "PATH=$PATH" >> ${GITHUB_ENV} git config --global --add safe.directory "*" pip install packaging ninja wheel setuptools twine - name: Define version id: xformers_version env: VERSION_SOURCE: ${{ github.ref_type == 'tag' && 'tag' || 'dev' }} run: | set -Eeuo pipefail git config --global --add safe.directory "*" version=`python .github/compute_wheel_version.py --source $VERSION_SOURCE` echo $version > version.txt echo "BUILD_VERSION=$version${{ steps.cuda_info.outputs.CUDA_VERSION_SUFFIX }}" >> ${GITHUB_ENV} echo "BUILD_VERSION=$version${{ steps.cuda_info.outputs.CUDA_VERSION_SUFFIX }}" >> ${GITHUB_OUTPUT} which ninja ninja --version cat ${GITHUB_ENV} - run: echo "xformers-${BUILD_VERSION}" - run: echo "release version (will upload to PyTorch)" if: ${{ !contains(steps.xformers_version.outputs.BUILD_VERSION, '.dev') }} - name: Install corresponding PyTorch run: | PYTORCH_INDEX_URL="https://download.pytorch.org/whl/${{ contains(inputs.toolkit_type, 'cuda') && 'cu' || 'rocm' }}${{ inputs.toolkit_short_version }}" $PY -m pip install wheel -r requirements.txt --extra-index-url $PYTORCH_INDEX_URL - name: Build wheel shell: bash -l {0} run: | $PY setup.py bdist_wheel -d dist/ -k $PLAT_ARG env: PLAT_ARG: ${{ 
contains(inputs.os, 'ubuntu') && '--plat-name manylinux_2_28_x86_64' || '' }} - run: du -h dist/* - uses: actions/upload-artifact@v4 with: name: ${{ inputs.os }}-py${{ inputs.python }}-torch${{ inputs.torch_version }}+${{ contains(inputs.toolkit_type, 'cuda') && 'cu' || 'rocm' }}${{ inputs.toolkit_short_version }}_${{ inputs.artifact_tag }} path: dist/*.whl # Note: it might be helpful to have additional steps that test if the built wheels actually work ================================================ FILE: .github/workflows/wheels_upload_pip.yml ================================================ name: wheels_upload_pip on: workflow_call: secrets: twine_password: required: true inputs: twine_username: required: true type: string pypirc: required: false type: string filter: required: true type: string description: Filter which runs to upload. Example '*+cu121*' execute: required: true type: boolean description: Actually upload the wheels. Dry-run if false artifact_tag: default: "facebookresearch" type: string env: TWINE_USERNAME: __token__ jobs: wheels_upload_pip: name: wheels_upload_pip runs-on: ubuntu-24.04 timeout-minutes: 360 defaults: run: shell: bash steps: - name: Recursive checkout uses: actions/checkout@v3 with: submodules: recursive path: "." fetch-depth: 0 # for tags # inspired by https://github.com/jlumbroso/free-disk-space/blob/main/action.yml - name: Free disk space run: | sudo rm -rf /usr/local/lib/android || true sudo rm -rf /usr/share/dotnet || true - name: Setup twine config if: inputs.pypirc run: | echo "${{ inputs.pypirc }}" > ~/.pypirc cat ~/.pypirc - uses: actions/download-artifact@v4 with: path: dist # Filter builds (eg vN+cu118 for instance) - run: ls -R dist/ - name: Extract builds to upload run: | set -ex mv dist all-dist mkdir dist for f in all-dist/${{ inputs.filter }}_${{ inputs.artifact_tag }}/*.whl; do cp $f dist/ done; - run: ls -R dist/ - name: Setup venv run: | python3 -m venv venv . 
./venv/bin/activate which pip # (we need pytorch to create a source distr...) pip install torch packaging twine echo "PY=$(which python)" >> ${GITHUB_ENV} echo "PATH=$PATH" >> ${GITHUB_ENV} - name: Create source distribution env: VERSION_SOURCE: ${{ github.ref_type == 'tag' && 'tag' || 'dev' }} run: | version=`$PY .github/compute_wheel_version.py --source $VERSION_SOURCE` echo $version > version.txt cat version.txt BUILD_VERSION=$version $PY setup.py sdist -d sdist/ - run: ls -R sdist/ - name: Upload wheel to PyPi if: inputs.execute run: $PY -m twine upload --skip-existing dist/*.whl sdist/* env: TWINE_USERNAME: ${{ inputs.twine_username }} TWINE_PASSWORD: ${{ secrets.twine_password }} ================================================ FILE: .github/workflows/wheels_upload_s3.yml ================================================ name: wheels_upload_s3 on: workflow_call: inputs: aws_role: required: true type: string s3_path: required: true type: string description: Example 's3://bucket/path/xformers/' aws_s3_cp_extra_args: required: false type: string default: '' description: Example '--acl public-read' filter: required: true type: string description: Filter which runs to upload. Example '*+cu121*' execute: required: true type: boolean description: Actually upload the wheels. Dry-run if false artifact_tag: default: "facebookresearch" type: string jobs: wheels_upload_s3: permissions: id-token: write # Needed to assume AWS role pull-requests: read contents: read name: ${{ inputs.s3_path }} runs-on: ubuntu-24.04 timeout-minutes: 360 defaults: run: shell: bash steps: - name: Recursive checkout uses: actions/checkout@v3 with: submodules: recursive path: "." 
fetch-depth: 0 # for tags # inspired by https://github.com/jlumbroso/free-disk-space/blob/main/action.yml - name: Free disk space run: | sudo rm -rf /usr/local/lib/android || true sudo rm -rf /usr/share/dotnet || true - uses: actions/download-artifact@v4 with: path: dist # Filter builds (eg vN+cu118 for instance) - run: ls -R dist/ - name: Extract builds to upload run: | set -ex mv dist all-dist mkdir dist for f in all-dist/${{ inputs.filter }}_${{ inputs.artifact_tag }}/*.whl; do cp $f dist/ done; - run: ls -R dist/ - name: configure aws credentials if: inputs.execute uses: aws-actions/configure-aws-credentials@v1.7.0 with: role-to-assume: ${{ inputs.aws_role }} role-session-name: GitHub_CI aws-region: "us-east-1" - name: Sts GetCallerIdentity if: inputs.execute run: | aws sts get-caller-identity - name: Upload wheels to ${{ inputs.s3_path }} if: inputs.execute run: | set -ex for f in dist/*.whl; do echo $f; aws s3 cp $f ${{ inputs.s3_path }} ${{ inputs.aws_s3_cp_extra_args }} done; aws s3 ls ${{ inputs.s3_path }} ================================================ FILE: .github/workflows/win-build.yml ================================================ name: win-build on: pull_request: paths: - "third_party/**" - "xformers/csrc/**" - ".github/workflows/win-build.yml" - ".github/actions/setup-build-cuda/action.yml" - "setup.py" - "requirements*.txt" env: FORCE_CUDA: 1 MAX_JOBS: 6 DISTUTILS_USE_SDK: 1 # otherwise distutils will complain on windows about multiple versions of msvc XFORMERS_BUILD_TYPE: "Release" TMPDIR: "./x" jobs: win_build: strategy: fail-fast: false matrix: arch: - "8.0" name: win-build-${{ matrix.arch }} runs-on: windows-8-core env: PY: python3 TORCH_CUDA_ARCH_LIST: ${{ matrix.arch }} timeout-minutes: 360 defaults: run: shell: bash steps: - name: Workarounds for longpaths - git-config run: | git config --system core.longpaths true - name: Recursive checkout uses: actions/checkout@v3 with: submodules: recursive path: "." 
- name: Workarounds for longpaths - TMPDIR run: | mkdir x python -c "import tempfile; print(tempfile.gettempdir())" python -c "import tempfile; assert(len(tempfile.gettempdir()) < 30)" - name: HACKFIX for cutlass compiler bug if: runner.os == 'Windows' run: | # See https://github.com/NVIDIA/cutlass/issues/1732 rm -f third_party/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp touch third_party/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp - name: Setup Runner uses: ./.github/actions/setup-build-cuda with: toolkit_type: "cuda" toolkit_short_version: "130" python: "3.10" - name: Remove internal code run: | mkdir -p .github/sync.fairinternal/ touch .github/sync.fairinternal/ossify.sh chmod +x .github/sync.fairinternal/ossify.sh .github/sync.fairinternal/ossify.sh - name: Install build dependencies run: | $PY -m pip install wheel setuptools ninja -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu130 git config --global --add safe.directory "*" $PY -c "import torch; print('torch', torch.__version__)" $PY -c "import torch; print('torch.cuda', torch.version.cuda)" ninja --version - name: Create sdist run: $PY setup.py sdist - name: Build from sdist shell: bash -l {0} run: | $PY -m pip install -v --no-build-isolation dist/* - name: Info run: | cd ../../ # So we don't have a folder named `xformers` XFORMERS_MORE_DETAILS=1 $PY -m xformers.info # - name: Open an SSH session on failure to debug # if: ${{ failure() }} # uses: mxschmitt/action-tmate@v3 ================================================ FILE: .gitignore ================================================ *~ *.swp *.pyc *.pyo *.so .mypy_cache/ *.egg-info/ build/ dist/ # for autocomplete compile_commands.json # Pytest verbose output test-results/ # Coverage reports .coverage .coverage.* # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ .vscode/* xformers/benchmarks/LRA/datasets xformers/benchmarks/LRA/logs my_runs.md # 
Triton cache .cache # JetBrains PyCharm IDE .idea/ # Pyre cache .pyre/ # Watchman config files .watchmanconfig # examples demo files examples/input.txt examples/lightning_logs examples/data # Hydra default output dir multirun outputs .benchmarks xformers/version.py xformers/cpp_lib.json ## temporary files xformers/csrc/attention/hip_fmha/*.cu xformers/csrc/attention/hip_fmha/*.hip xformers/csrc/attention/hip_fmha/*_hip.h xformers/csrc/attention/hip_fmha/instances/*.cu xformers/csrc/attention/hip_fmha/instances/*.hip xformers/csrc/attention/hip_fmha/instances/*_hip.h xformers/csrc/attention/hip_decoder/*.cu xformers/csrc/attention/hip_decoder/*.hip xformers/csrc/attention/hip_decoder/*_hip.h ================================================ FILE: .gitmodules ================================================ [submodule "third_party/cutlass"] path = third_party/cutlass url = https://github.com/NVIDIA/cutlass.git [submodule "third_party/composable_kernel_tiled"] path = third_party/composable_kernel_tiled url = https://github.com/ROCm/composable_kernel.git branch = develop ================================================ FILE: .isort.cfg ================================================ [settings] known_third_party =fvcore,hydra,input_pipeline,matplotlib,numpy,omegaconf,pandas,pl_bolts,pyre_extensions,pytest,pytorch_lightning,ragged_inference,recommonmark,seaborn,setuptools,sklearn,submitit,tensorflow,timm,torch,torchmetrics,torchvision,tqdm,triton,typing_extensions skip_glob=third_party/* ================================================ FILE: .markdownlint.json ================================================ { "MD013": false, "MD033": false } ================================================ FILE: .pre-commit-config.yaml ================================================ exclude: 'build|stubs' default_language_version: python: python3 repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v3.4.0 hooks: - id: trailing-whitespace - id: check-ast - id: 
check-merge-conflict - id: no-commit-to-branch args: ['--branch=master'] - id: check-added-large-files args: ['--maxkb=500'] - id: end-of-file-fixer - repo: https://github.com/omnilib/ufmt rev: v2.8.0 hooks: - id: ufmt additional_dependencies: - black == 26.3.1 - usort == 1.0.8.post1 - repo: https://github.com/pycqa/flake8 rev: 6.1.0 hooks: - id: flake8 additional_dependencies: [flake8-copyright] - repo: https://github.com/pre-commit/mirrors-mypy rev: 'v1.10.0' hooks: - id: mypy ================================================ FILE: .pyre_configuration ================================================ { "ignore_all_errors": ["xformers/benchmarks/"], "python_version": "3.9", "source_directories": [ "stubs", {"import_root": ".", "source": "xformers"} ] } ================================================ FILE: CHANGELOG.md ================================================ # Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [0.0.36] - 2026-??-?? ## [0.0.35] - 2026-02-20 Pre-built binary wheels are available for PyTorch 2.10.0 (and later). ### Improved - Supported free-threading Python. ### Removed - Stopped bundling pre-built versions of Flash-Attention 3, and instead started relying on the wheels provided by the PyTorch indices. ## [0.0.34] - 2026-01-23 Pre-built binary wheels are available for PyTorch 2.10.0 (and later). ### Improved - Migrated xFormers to the PyTorch stable API/ABI, which means that binary builds targeting PyTorch 2.10+ will be compatible with any later version ### Removed - Removed optimized fast-path of SwiGLU (which was only available for A100 GPUs) - Removed most legacy components ## [0.0.33.post2] - 2025-12-03 Pre-built binary wheels are available for PyTorch 2.9.1. 
## [0.0.33.post1] - 2025-11-13 Fixed issues with wheel upload to PyPI ## [0.0.33] - 2025-11-12 Pre-built binary wheels are available for PyTorch 2.9.0. ### Added - cutlass fmha Op for Blackwell GPUs - Support flash-attention package up to 2.8.3 - expose FA3 deterministic mode - FW+BW pass overlap for DeepSeek-like comms/compute overlap ### Improved - merge_attentions support for irregular head dimension ## [0.0.32] - 2025-08-13 Pre-built binary wheels are available for PyTorch 2.8.0. ### Added - Support flash-attention package up to 2.8.2 - Speed improvements to `python -m xformers.profiler.find_slowest` ### Removed - Removed autograd backward pass for merge_attentions as it is easy to use incorrectly. - Attention biases are no longer `torch.Tensor` subclasses. This is no longer necessary for torch.compile to work, and was adding more complexity ## [0.0.31] - 2025-06-25 Pre-built binary wheels are available for PyTorch 2.7.1. ### Added - xFormers wheels are now python-version agnostic: this means that the same wheel can be used for python 3.9, 3.10, ... 3.13 - Added support for Flash-Attention 3 on Ampere GPUs ### Removed - We will no longer support V100 or older GPUs, following PyTorch (pytorch/pytorch#147607) - Deprecated support for building Flash-Attention 2 as part of xFormers. For Ampere GPUs, we now use Flash-Attention 3 on windows, and Flash-Attention 2 can still be used through PyTorch on linux. ## [0.0.30] - 2025-04-28 Pre-built binary wheels are available for PyTorch 2.7.0. Following PyTorch, we build wheels for CUDA 11.8, 12.6, and 12.8 only (we no longer build for CUDA 12.4). 
xFormers now requires PyTorch >= 2.7 ### Added - [fMHA] Added support for local attention on the Flash3 backend (H100) - [fMHA] Added a new paged gappy attention bias ### Improved - [fMHA] The FlashAttention3 backend now ships with more head dimensions to support MLA, and with a FLOPs formula in order to be compatible with PyTorch's partitioner-based automatic activation checkpointing - The fused operators for sequence parallelism were migrated to PyTorch's SymmetricMemory - The profiler prepends the traces' filenames with the rank of the process when doing distributed training ### Removed - Removed documentation for legacy unmaintained components ## [0.0.29.post2] - 2025-01-31 Pre-built binary wheels are available for PyTorch 2.6.0. Following PyTorch, we build wheels for CUDA 11.8, 12.4, and 12.6 only (we no longer build for CUDA 12.1). xFormers now requires PyTorch >= 2.6 ## [0.0.29] - 2024-12-27 ### Improved: - [fMHA] Creating a `LowerTriangularMask` no longer creates a CUDA tensor - [fMHA] Updated Flash-Attention to `v2.7.2.post1` - [fMHA] Flash-Attention v3 will now be used by `memory_efficient_attention` by default when available, unless the operator is enforced with the `op` keyword-argument. Switching from Flash2 to Flash3 can make transformer trainings ~10% faster end-to-end on H100s - [fMHA] Fixed a performance regression with the `cutlass` backend for the backward pass (facebookresearch/xformers#1176) - mostly used on older GPUs (eg V100) - Fixed swiglu operator compatibility with torch-compile with PyTorch 2.6 - Fix activation checkpointing of SwiGLU when AMP is enabled (facebookresearch/xformers#1152) ### Removed: - Following PyTorch, xFormers no longer builds binaries for conda.
Pip is now the only recommended way to get xFormers - Removed unmaintained/deprecated components in `xformers.components.*` (see facebookresearch/xformers#848) ## [0.0.28.post3] - 2024-10-30 Pre-built binary wheels require PyTorch 2.5.1 ## [0.0.28.post2] - 2024-10-18 Pre-built binary wheels require PyTorch 2.5.0 ## [0.0.28.post1] - 2024-09-13 Properly upload wheels for cuda 12.4 ## [0.0.28] - 2024-09-12 Pre-built binary wheels require PyTorch 2.4.1 ### Added - Added wheels for cuda 12.4 - Added conda builds for python 3.11 - Added wheels for rocm 6.1 ### Improved - Profiler: Fix computation of FLOPS for the attention when using xFormers - Profiler: Fix MFU/HFU calculation when multiple dtypes are used - Profiler: Trace analysis to compute MFU & HFU is now much faster - fMHA/splitK: Fixed `nan` in the output when using a `torch.Tensor` bias where a lot of consecutive keys are masked with `-inf` - Update Flash-Attention version to `v2.6.3` *when building from scratch* - When using the most recent version of Flash-Attention, it is no longer possible to mix it with the cutlass backend. In other words, it is no longer possible to use the cutlass Fw with the flash Bw. 
### Removed - fMHA: Removed `decoder` and `small_k` backends - profiler: Removed `DetectSlowOpsProfiler` profiler - Removed compatibility with PyTorch < 2.4 - Removed conda builds for python 3.11 - Removed windows pip wheels for cuda 12.1 and 11.8 ## [0.0.27.post2] - 2024-07-26 Pre-built binary wheels require PyTorch 2.4.0 ## [0.0.27.post1] - 2024-07-25 Pre-built binary wheels require PyTorch 2.4.0 ## [0.0.27] - 2024-07-10 Pre-built binary wheels require PyTorch 2.3.1 ### Added - fMHA: `PagedBlockDiagonalGappyKeysMask` - fMHA: heterogeneous queries in `triton_splitk` - fMHA: support for paged attention in flash - fMHA: Added backwards pass for `merge_attentions` - fMHA: Added `torch.compile` support for 3 biases (`LowerTriangularMask`, `LowerTriangularMaskWithTensorBias` and `BlockDiagonalMask`) - some might require PyTorch 2.4 - fMHA: Added `torch.compile` support in `memory_efficient_attention` when passing the flash operator explicitly (eg `memory_efficient_attention(..., op=(flash.FwOp, flash.BwOp))`) - fMHA: `memory_efficient_attention` now expects its `attn_bias` argument to be on the same device as the other input tensor. Previously, it would convert the bias to the right device.
- fMHA: `AttentionBias` subclasses are now constructed by default on the `cuda` device if available - they used to be created on the CPU device - 2:4 sparsity: Added `xformers.ops.sp24.sparsify24_ste` for Straight Through Estimator (STE) with options to rescale the gradient differently for masked out/kept values ### Improved - fMHA: Fixed out-of-bounds reading for Split-K triton implementation - Profiler: fix bug with modules that take a single tuple as argument - Profiler: Added manual trigger for a profiling step, by creating a `trigger` file in the profiling directory ### Removed - Removed support for PyTorch version older than 2.2 ## [0.0.26] - 2024-04-29 Pre-built binary wheels require PyTorch 2.3.0 ### Added - [2:4 sparsity] Added support for Straight-Through Estimator for `sparsify24` gradient (`GRADIENT_STE`) - [2:4 sparsity] `sparsify24_like` now supports the cuSparseLt backend, and the STE gradient - Basic support for `torch.compile` for the `memory_efficient_attention` operator. Currently only supports Flash-Attention, and without any bias provided. We want to expand this coverage progressively. ### Improved - merge_attentions no longer needs inputs to be stacked. - fMHA: triton_splitk now supports additive bias - fMHA: benchmark cleanup ## [0.0.25.post1] - 2024-03-29 Pre-built binary wheels require PyTorch 2.2.2 ## [0.0.25] - 2024-03-14 Pre-built binary wheels require PyTorch 2.2.1 ### Added - New `merge_attentions` function - fMHA: New gappy attention biases. ### Improved - fMHA: Updated Flash-Attention to v2.5.6: this has a performance improvement for multiquery. - fMHA: triton_splitk changed and expanded. Now amalgamates using LSE. Can autotune, supports causal with a small number of queries - not just 1. Experimental support for paged attention. 
- `rope_padded`: Fixed CUDA error with many queries (more than 65k) - `rmsnorm`: Fixed CUDA error with large inputs (enables 512k+ sequence length on Llama2 70B) ### Removed - fMHA: Removed triton operator (`fmha.triton.*`, `xformers.ops.MemoryEfficientAttentionTritonFwdFlashBwOp`, `xformers.ops.TritonFlashAttentionOp`), as it has correctness issues under some conditions, and is slower than other implementations. ## [0.0.24] - 2024-01-31 Pre-built binary wheels require PyTorch 2.2.0 ### Added - Added components for model/sequence parallelism, as near-drop-in replacements for FairScale/Megatron Column&RowParallelLinear modules. They support fusing communication and computation for sequence parallelism, thus making the communication effectively free. [Read more](https://twitter.com/d_haziza/status/1753030654118211593) - Added kernels for training models with 2:4-sparsity. We introduced a very fast kernel for converting a matrix A into 24-sparse format, which can be used during training to sparsify weights dynamically, activations etc... xFormers also provides an API that is compatible with torch-compile, see `xformers.ops.sparsify24`. ### Improved - Make selective activation checkpointing be compatible with torch.compile. ### Removed - Triton kernels now require a GPU with compute capability 8.0 at least (A100 or newer). This is due to newer versions of triton not supporting older GPUs correctly - Removed support for PyTorch version older than 2.1.0 ## [0.0.23] - 2023-12-05 Pre-built binary wheels require PyTorch 2.1.1 (xFormers `0.0.23`) or PyTorch 2.1.2 (xFormers `0.0.23.post1`). ### Fixed - fMHA: Fixed a bug in cutlass backend forward pass where the logsumexp was not correctly calculated, resulting in wrong results in the BW pass. 
This would happen with MQA when one sequence has a query with `length%64 == 1` - fMHA: Updated Flash-Attention to v2.3.6 - this fixes a performance regression in causal backward passes, and now supports `BlockDiagonalCausalWithOffsetPaddedKeysMask` ### Added - fMHA: Added `LocalAttentionFromBottomRightMask` (local) - fMHA: Added `LowerTriangularFromBottomRightMask` (causal) - fMHA: Added `LowerTriangularFromBottomRightLocalAttentionMask` (local + causal) ### Removed - Removed `xformers.triton.sum_strided` ## [0.0.22] - 2023-09-27 ### Fixed - fMHA: Backward pass now works in PyTorch deterministic mode (although slower) ### Added - fMHA: Added experimental support for Multi-Query Attention and Grouped-Query Attention. This is handled by passing 5-dimensional inputs to `memory_efficient_attention`, see the documentation for more details - fMHA: Added experimental support for Local Attention biases to `memory_efficient_attention` - Added an example of efficient [LLaMa decoding](https://github.com/facebookresearch/xformers/tree/main/examples/llama_inference) using xformers operators - Added Flash-Decoding for faster attention during Large Language Model (LLM) decoding - up to 50x faster for long sequences (token decoding up to 8x faster end-to-end) - Added an efficient rope implementation in triton, to be used in LLM decoding - Added selective activation checkpointing, which gives fine-grained control of which activations to keep and which activations to recompute - `xformers.info` now indicates the Flash-Attention version used ### Removed - fMHA: Removed `smallK` backend support for CPU. 
`memory_efficient_attention` only works for CUDA/GPU tensors now - **DEPRECATION**: Many classes in `xformers.factory`, `xformers.triton` and `xformers.components` have been or will be deprecated soon (see tracking issue facebookresearch/xformers#848) ## [0.0.21] - 2023-08-18 ### Improved - fMHA: Updated [flash-attention](https://github.com/Dao-AILab/flash-attention) to v2, with massive performance improvements for both the forward pass and backward pass. This implementation is now used by default when it's available ### Bug fixes - fMHA/cutlass: Fix potential race condition in the FW/BW passes - fMHA/cutlass: Fix `attn_bias` stride overflow for very long sequences (>32k) - `LowerTriangularMask` is now backward compatible with older xformers versions ### Breaking changes - `memory_efficient_attention` now expects the `attn_bias` argument to have a head dimension - `memory_efficient_attention` no longer broadcasts the batch/head dimensions of `attn_bias`. Please use `.expand` if you need to broadcast the bias - Remove `causal_diagonal` argument from `BlockDiagonalCausalWithOffsetPaddedKeysMask` ### Added - Binary wheels on pypi/conda now contain H100 kernels - fMHA: Added backend specialized for decoding that does not use TensorCores - useful when not using multiquery **NOTE**: Binary wheels are now provided only for PyTorch 2 with cuda 11.8. It is still possible to use xFormers with older versions of PyTorch by building from source or using conda. 
## [0.0.20] - 2023-05-23 ### Improved - fMHA/cutlass (backward): Massive performance improvements when `batch_size * num_heads` is low (10x+) - fMHA/cutlass: Further performance improvements for both the forward & backward kernels - fMHA (backward): Now dispatching to cutlass when `embed_dim>64` - fMHA: Updated Flash-Attention to `v1.0.5` ### Added - fMHA now runs on H100 (support is experimental) ## [0.0.19] - 2023-04-28 ### Added - Display `nvcc` version used to compile `xformers` in `python -m xformers.info` ### Fixed - Fixed performance regression with `nvcc>11.6` (facebookresearch/xformers#712) - fMHA/cutlass: Fixed `nan` in the output when using a `torch.Tensor` with `-inf` prefixes as `attn_bias` (facebookresearch/xformers#722) - fMHA/cutlass: Fixed `nan` in the output when the sequence length is larger than `2 ** 15` (facebookresearch/xformers#719) - fMHA/cutlass: Significant performance improvements (up to 2x) for both the forward pass and backward pass - fMHA/cutlass: The kernels are now deterministic - fMHA/cutlass: Fixed backward pass correctness when using dropout (facebookresearch/xformers#724) ## [0.0.18] - 2023-03-31 ### Added - Added `xformers.ops.index_select_cat` and `xformers.ops.scaled_index_add` - those are experimental functions that only work with a few shapes, and can be used to write efficient stochastic depth in transformer architectures for instance ### Fixed - fMHA: `memory_efficient_attention` now accepts `torch.Tensor` as attention bias for any seqlen, although there are still requirements on the alignment of the bias tensor (see facebookresearch/xformers#683) ## [0.0.17] - 2023-03-28 ### Fixed - fMHA: Fixed BW pass on Sm86/Sm89 GPUs when `K > 64` (RTX 3090, RTX 4090, A6000, ..)
[facebookresearch/xformers#631] ### Added - fMHA/CUTLASS: Added tensor attn bias support [facebookresearch/xformers#587] - contribution from [@jfc4050](https://github.com/jfc4050) - fMHA/CUTLASS: Added tensor attn bias grad support [facebookresearch/xformers#587] - contribution from [@jfc4050](https://github.com/jfc4050) - fMHA/CUTLASS: Added dropout support [facebookresearch/xformers#587] - contribution from [@jfc4050](https://github.com/jfc4050) - fMHA: Added support for varying sequence lengths [facebookresearch/xformers#500] ## [0.0.16] - 2023-01-31 ### Fixed - Updated triton dependency [facebookresearch/xformers#418] - Stripped lineinfo from binaries, reducing the binary size [facebookresearch/xformers#549] - Added support for pip wheels [facebookresearch/xformers#588, facebookresearch/xformers#573, facebookresearch/xformers#534, facebookresearch/xformers#523, ...] big thanks to [@AbdBarho](https://github.com/AbdBarho)! - Fixed compatibility with Python 3.7 [facebookresearch/xformers#541] - thanks to [@susumuota](https://github.com/susumuota) - fMHA: Fixed strides for QKV gradients for cutlass attention [facebookresearch/xformers#535] - fMHA: Stricter inputs validation to avoid CUDA errors for unsupported inputs [facebookresearch/xformers#592] - fMHA/Flash-Attention: Updated to https://github.com/HazyResearch/flash-attention/commit/a1f49a2b92b6fa022379bbebafed9d7f5e96a675 with multiple changes from [@TriDao](https://github.com/tridao) that make the operator up to 20% faster - fMHA/Flash-Attention: Fixed backward pass wrapper, where non-contiguous gradients could give the wrong result [facebookresearch/xformers#548] - fMHA: Separate each operator into forward and backward operators.
It's now possible to use any combination of forward+backward (for instance Triton forward and Flash-Attention backward) [facebookresearch/xformers#560] ### Added - fMHA: Added Triton operator for forward pass from [Flash-Attention](https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py) authored by [@TriDao](https://github.com/tridao), will be automatically used on A100 when compatible - fMHA: Added [`xformers.ops.memory_efficient_attention_forward`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention_forward), [`xformers.ops.memory_efficient_attention_forward_requires_grad`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention_forward_requires_grad), [`xformers.ops.memory_efficient_attention_backward`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention_backward) for power-users who write custom autograd functions [facebookresearch/xformers#560] - fMHA: Support for custom scaling for the CUTLASS-based kernel [facebookresearch/xformers#530] - contribution from [@comaniac](https://github.com/comaniac) ## [0.0.15] - Skipped ## [0.0.14] - 2022-11-10 ### Fixed - fMHA/CUTLASS: The current CUDA stream is now used by the kernel [facebookresearch/xformers#491] - fMHA/CUTLASS: Improve overall performance ### Added - SwiGLU: Added `xformers.ops.SwiGLU` and its functional counterpart (`xformers.ops.swiglu`) [facebookresearch/xformers#490] - fMHA: Possible to combine CUTLASS's forward with flash-attention's backward pass [facebookresearch/xformers#469] - improves performance on A100 for K = 128 - fMHA: Add custom `xformers.ops.unbind` operator to avoid a cat in the attention block [facebookresearch/xformers#458] ## [0.0.13] - 2022-09-26 ### Added - fMHA: Added CUTLASS-based kernel for `xformers.ops.memory_efficient_attention`. 
This kernel is automatically selected depending on the inputs, and works on any GPU after P100 [facebookresearch/xformers#362] ## [0.0.12] - 2022-08-08 ### Fixed - Removed duplicated biases in the FusedMLP layers [facebookresearch/xformers#317] - Rotary embeddings respecting input types [facebookresearch/xformers#326] - Poolformer style instantiating useless projection layers [facebookresearch/xformers#349] - Fix layer position not being properly tracked, causing extra layernorms for programmatic xformers [facebookresearch/xformers#348] - Pass use_triton flag to LayerNorm module [facebookresearch/xformers#336] ### Added - Four blocksparsity layouts from DeepSpeed [facebookresearch/xformers#320] - Support several initialization options [facebookresearch/xformers#312] - Conv2DFeedforward feedforward part [facebookresearch/xformers#321] - VisualAttention [facebookresearch/xformers#329] - Automatic blocksparse for causal attention [facebookresearch/xformers#334] - Better hierarchical transformer generation [facebookresearch/xformers#345] - Fused operations with AOTAutograd/NVFuser, integration into MLP [facebookresearch/xformers#357] - Refactor LRA code to use Pytorch Lightning [facebookresearch/xformers#343] ## [0.0.11] - 2022-05-30 ### Fixed - Fix some torchscriptability [facebookresearch/xformers#246] - Fix FourierMix being compatible with AMP [facebookresearch/xformers#258] - Better asserts on QKV dimensions [facebookresearch/xformers#264] - Better perfs for FusedMLP and FusedLinearLayer [facebookresearch/xformers#283] - Deepnorm init missing self-attention [facebookresearch/xformers#284] ### Added - Simplicial Embeddings [facebookresearch/xformers#259] - Mem efficient attention, FW pass [facebookresearch/xformers#267] - MHA benchmark - MLP benchmark - Move all triton kernels to triton v2 [facebookresearch/xformers#272] - Mem efficient attention, BW pass [facebookresearch/xformers#281] - Metaformer support [facebookresearch/xformers#294] ## [0.0.10] - 2022-03-14 ### Fixed -
Expose bias flag for feedforwards, same default as Timm [facebookresearch/xformers#220] - Update eps value for layernorm, same default as torch [facebookresearch/xformers#221] - PreNorm bugfix, only one input was normalized [facebookresearch/xformers#233] - Fix bug where embedding dimensions that did not match model dim would lead to a crash [facebookresearch/xformers#244] ### Added - Add DeepNet (DeepNorm) residual path and init [facebookresearch/xformers#227] ## [0.0.9] - 2022-02-09 ### Added - Compositional Attention [facebookresearch/xformers#41] - Experimental Ragged attention [facebookresearch/xformers#189] - Mixture of Experts [facebookresearch/xformers#181] - BlockSparseTensor [facebookresearch/xformers#202] - Nd-tensor support for triton softmax [facebookresearch/xformers#210] ### Fixed - Bugfix Favor, single feature map [facebookresearch/xformers#183] - Sanity check blocksparse settings [facebookresearch/xformers#207] - Fixed some picklability [facebookresearch/xformers#204] ## [0.0.8] - 2022-01-07 ### Fixed - Much faster fused dropout [facebookresearch/xformers#164] - Fused dropout repeatability [facebookresearch/xformers#173] ### Added - Embedding weight tying option [facebookresearch/xformers#172] ## [0.0.7] - 2021-11-30 ### Fixed - Dropout setting not properly passed in many attentions [facebookresearch/xformers#123] ## [0.0.6] - 2021-11-24 ### Fixed - Fix self attention optimization not being triggered, broken residual path [facebookresearch/xformers#119] - Improve speed by not using contiguous Tensors when not needed [facebookresearch/xformers#119] ### Added - Attention mask wrapper [facebookresearch/xformers#113] - ViT comparison benchmark [facebookresearch/xformers#117] ## [0.0.4] - 2021-11-16 ### Fixed - Homogenizing the masks, additive or bool [facebookresearch/xformers#79][facebookresearch/xformers#85][facebookresearch/xformers#86] - Fix causality flag not being respected [facebookresearch/xformers#103] - Enabling FusedLayerNorm by default in 
the factory if Triton is available - Fixing Favor with fp16 - Fixing Favor trainability ### Added - Fused dropout/bias/activation layer [facebookresearch/xformers#58] - Fused layernorm used by default in the factory [facebookresearch/xformers#92] ## [0.0.3] - 2021-11-01 ### Fixed - Nystrom causal attention [facebookresearch/xformers#75] ## [0.0.2] - 2021-11-01 ### Fixed - More robust blocksparse [facebookresearch/xformers#24] ### Added - Rotary embeddings [facebookresearch/xformers#32] - More flexible layernorm [facebookresearch/xformers#50] ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 
## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies within all project spaces, and it also applies when an individual is representing the project or its community in public spaces. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
This Code of Conduct also applies outside the project spaces when there is a reasonable belief that an individual's behavior may have a negative impact on the project or its community. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at <opensource-conduct@fb.com>. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to the xFormers repo We want to make contributing to this project as easy and transparent as possible. ## Our Development Process Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. ## Pull Requests We actively welcome your pull requests. 1. Fork the repo and create your branch from `main`. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. 5. Make sure your code lints. 6. If you haven't already, complete the Contributor License Agreement ("CLA").
## Contributor License Agreement ("CLA") In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Facebook's open source projects. Complete your CLA here: <https://code.facebook.com/cla> ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe disclosure of security bugs. In those cases, please go through the process outlined on that page and do not file a public issue. ## Environment setup ```bash ~$ python3 -m venv venv2 ~$ source venv2/bin/activate (venv2) ~$ cd git/template/ (venv2) ~/git/template $ pip3 install -r requirements-test.txt ``` ## Coding Style In your editor, install the [editorconfig](https://editorconfig.org/) extension which should ensure that you are following the same standards as us. Two options to make sure that the code is formatted and linted properly: * either you run ufmt, flake8 and mypy yourself before opening your PR: ```bash ufmt format flake8 --config .flake8 mypy --ignore-missing-imports --scripts-are-modules --pretty --exclude build/ --exclude stubs/ . ``` * or you can just install [pre-commit](https://pre-commit.com/), which will make sure that all of the above is run automatically any time you commit. In that case, you would need to ```bash pip install pre-commit ``` then (in the xformers repository, just once) ```bash pre-commit install ``` After these steps each of your commits will run the same linting and formatting routines as the xformers continuous integration, which greatly helps getting your PRs all green! _Read the [editorconfig](.editorconfig) file to understand the exact coding style preferences._ ## Testing ### Static analysis ```bash mypy --ignore-missing-imports --scripts-are-modules --pretty --exclude stubs/ .
``` ### Unit tests ```bash pytest ``` or ``` bash python -m pytest ``` ### Check test coverage ``` bash python -m pytest --cov-report term --cov=template tests ``` ### CircleCI status From your PR page, you can expand on the CircleCI results. For GPU test, you should see what CI has run, like: ``` bash ... ----- generated xml file: /home/circleci/template/test-results/junit.xml ------ ================== 217 passed, 2 xfailed in 218.74s (0:03:38) ================== CircleCI received exit code 0 ``` The number of passed and failed should give you an idea on whether your local test was the same or not. ## Commit Guidelines We follow the same guidelines as AngularJS. Each commit message consists of a **header**, a **body** and a **footer**. The header has a special format that includes a **type**, and a **subject**: ```bash []