gitextract_37zz2va9/

├── .all_crates.sh
├── .change_crate_dep.sh
├── .clang-format
├── .github/
│   ├── dependabot.yml
│   └── workflows/
│       ├── asan.yml
│       ├── binaries.yml
│       ├── cost_model.yml
│       ├── crates.yml
│       ├── cross-platform.yml
│       ├── examples.yml
│       ├── full.yml
│       ├── large_models.yml
│       ├── pydoc.yml
│       ├── release.yml
│       ├── tract-ci-bench.yml
│       ├── wheels.yml
│       └── windows.yml
├── .gitignore
├── .travis/
│   ├── README.md
│   ├── android-ndk.sh
│   ├── asan.sh
│   ├── bundle-entrypoint.sh
│   ├── cache_file.sh
│   ├── cargo-deny-check.sh
│   ├── ci-system-setup.sh
│   ├── cli-tests.sh
│   ├── cost_model_task_build.sh
│   ├── cross.sh
│   ├── debug-tests.sh
│   ├── docker-debian-stretch/
│   │   ├── Dockerfile
│   │   └── sources.list
│   ├── examples.sh
│   ├── llm-expectations-541
│   ├── make_bundle.sh
│   ├── minion.sh
│   ├── minionrc
│   ├── native.sh
│   ├── onnx-tests.sh
│   ├── regular-tests.sh
│   ├── run-bundle.sh
│   ├── run_all.sh
│   ├── setup-sccache.sh
│   ├── test-harness.sh
│   ├── test-llm.sh
│   ├── test-published-crates.sh
│   ├── test-rt.sh
│   ├── tf.sh
│   ├── tflite/
│   │   ├── Dockerfile.tensorflow-aarch64
│   │   ├── Dockerfile.tensorflow-official-rpi
│   │   ├── Dockerfile.tensorflow-rpitools
│   │   ├── build_tflite_aarch64.sh
│   │   ├── build_tflite_raspbian.sh
│   │   ├── convert_all.sh
│   │   ├── linux_makefile.inc
│   │   └── run_all.sh
│   ├── tflite.sh
│   └── travis.sh
├── .travis.yml
├── .vim/
│   └── coc-settings.json
├── CHANGELOG.md
├── Cargo.toml
├── LICENSE
├── LICENSE-APACHE
├── LICENSE-MIT
├── README.md
├── api/
│   ├── .gitignore
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   ├── c/
│   │   ├── .gitignore
│   │   ├── Makefile
│   │   ├── grace_hopper_3_224_224.f32.raw
│   │   └── mobilenet.c
│   ├── ffi/
│   │   ├── Cargo.toml
│   │   ├── cbindgen.toml
│   │   └── src/
│   │       └── lib.rs
│   ├── generate-tract-h.sh
│   ├── proxy/
│   │   ├── Cargo.toml
│   │   ├── LICENSE
│   │   ├── LICENSE-APACHE
│   │   ├── LICENSE-MIT
│   │   ├── ci.sh
│   │   ├── src/
│   │   │   └── lib.rs
│   │   ├── sys/
│   │   │   ├── Cargo.toml
│   │   │   ├── build.rs
│   │   │   ├── src/
│   │   │   │   └── lib.rs
│   │   │   └── tract.h
│   │   └── tests/
│   │       └── mobilenet.rs
│   ├── py/
│   │   ├── .gitignore
│   │   ├── MANIFEST.in
│   │   ├── _static/
│   │   │   ├── redirect-index.html
│   │   │   └── version-switcher.js
│   │   ├── conf.py
│   │   ├── docs/
│   │   │   ├── fact.md
│   │   │   ├── index.md
│   │   │   ├── inference_model.md
│   │   │   ├── model.md
│   │   │   ├── nnef.md
│   │   │   ├── onnx.md
│   │   │   ├── runnable.md
│   │   │   └── tensor.md
│   │   ├── grace_hopper_1x3x224x244.npy
│   │   ├── pyproject.toml
│   │   ├── requirements-docs.txt
│   │   ├── requirements.txt
│   │   ├── setup.py
│   │   ├── tests/
│   │   │   └── mobilenet_onnx_test.py
│   │   └── tract/
│   │       ├── __init__.py
│   │       ├── bindings.py
│   │       ├── dim.py
│   │       ├── fact.py
│   │       ├── inference_model.py
│   │       ├── model.py
│   │       ├── nnef.py
│   │       ├── onnx.py
│   │       ├── runnable.py
│   │       ├── runtime.py
│   │       ├── state.py
│   │       ├── tensor.py
│   │       └── transform.py
│   ├── rs/
│   │   ├── Cargo.toml
│   │   ├── LICENSE
│   │   ├── LICENSE-APACHE
│   │   ├── LICENSE-MIT
│   │   ├── src/
│   │   │   └── lib.rs
│   │   └── tests/
│   │       └── mobilenet.rs
│   ├── src/
│   │   ├── lib.rs
│   │   ├── macros.rs
│   │   └── transform.rs
│   └── tests/
│       ├── grace_hopper_3_224_224.f32.raw
│       └── mobilenet/
│           └── mod.rs
├── ci/
│   └── tract-ci-minion/
│       ├── .gitignore
│       ├── Cargo.toml
│       ├── minion.toml.example
│       └── src/
│           └── main.rs
├── cli/
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   └── src/
│       ├── bench.rs
│       ├── compare.rs
│       ├── cost.rs
│       ├── dump.rs
│       ├── hwbench.rs
│       ├── llm.rs
│       ├── macros.rs
│       ├── main.rs
│       ├── memory_arena.rs
│       ├── model.rs
│       ├── params.rs
│       ├── plan_options.rs
│       ├── run.rs
│       ├── runtimes.rs
│       ├── tensor.rs
│       └── utils.rs
├── core/
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   ├── src/
│   │   ├── axes/
│   │   │   ├── mapping.rs
│   │   │   ├── mod.rs
│   │   │   └── model.rs
│   │   ├── broadcast.rs
│   │   ├── floats.rs
│   │   ├── framework.rs
│   │   ├── late_bind.rs
│   │   ├── lib.rs
│   │   ├── macros.rs
│   │   ├── model/
│   │   │   ├── fact.rs
│   │   │   ├── graph.rs
│   │   │   ├── helpers.rs
│   │   │   ├── memory.rs
│   │   │   ├── mod.rs
│   │   │   ├── node.rs
│   │   │   ├── order.rs
│   │   │   ├── patch.rs
│   │   │   ├── rewriter.rs
│   │   │   ├── translator.rs
│   │   │   └── typed.rs
│   │   ├── ops/
│   │   │   ├── array/
│   │   │   │   ├── broadcast.rs
│   │   │   │   ├── concat.rs
│   │   │   │   ├── dyn_slice.rs
│   │   │   │   ├── gather.rs
│   │   │   │   ├── gather_elements.rs
│   │   │   │   ├── gather_nd.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── one_hot.rs
│   │   │   │   ├── pad.rs
│   │   │   │   ├── range.rs
│   │   │   │   ├── reshape.rs
│   │   │   │   ├── scatter_elements.rs
│   │   │   │   ├── scatter_nd.rs
│   │   │   │   ├── slice.rs
│   │   │   │   ├── strided_slice.rs
│   │   │   │   ├── tile.rs
│   │   │   │   ├── topk.rs
│   │   │   │   └── trilu.rs
│   │   │   ├── binary.rs
│   │   │   ├── cast.rs
│   │   │   ├── change_axes.rs
│   │   │   ├── cnn/
│   │   │   │   ├── conv/
│   │   │   │   │   ├── block_quant.rs
│   │   │   │   │   ├── conv.rs
│   │   │   │   │   ├── depth_wise.rs
│   │   │   │   │   ├── im2col.rs
│   │   │   │   │   ├── lazy_im2col.rs
│   │   │   │   │   ├── mod.rs
│   │   │   │   │   └── q_sum_b.rs
│   │   │   │   ├── deconv/
│   │   │   │   │   ├── deconv.rs
│   │   │   │   │   ├── deconv_sum.rs
│   │   │   │   │   └── mod.rs
│   │   │   │   ├── maxpool.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── padding.rs
│   │   │   │   ├── patch_axis.rs
│   │   │   │   ├── patches.rs
│   │   │   │   ├── pools.rs
│   │   │   │   └── sumpool.rs
│   │   │   ├── downsample/
│   │   │   │   ├── array.rs
│   │   │   │   ├── conv.rs
│   │   │   │   ├── mod.rs
│   │   │   │   └── scan.rs
│   │   │   ├── dummy.rs
│   │   │   ├── einsum/
│   │   │   │   ├── as_blas.rs
│   │   │   │   ├── einsum_matmul.rs
│   │   │   │   ├── eval.rs
│   │   │   │   ├── kernel_selection.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── prefix_matmul.rs
│   │   │   │   └── proptest.rs
│   │   │   ├── element_wise.rs
│   │   │   ├── fft.rs
│   │   │   ├── identity.rs
│   │   │   ├── konst.rs
│   │   │   ├── logic/
│   │   │   │   ├── comparison.rs
│   │   │   │   └── ite.rs
│   │   │   ├── logic.rs
│   │   │   ├── macros.rs
│   │   │   ├── math/
│   │   │   │   ├── complex.rs
│   │   │   │   └── mod.rs
│   │   │   ├── matmul/
│   │   │   │   ├── de_block_quant.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── optimized.rs
│   │   │   │   ├── pack.rs
│   │   │   │   └── quant.rs
│   │   │   ├── memory/
│   │   │   │   ├── force_eval.rs
│   │   │   │   ├── load.rs
│   │   │   │   ├── mod.rs
│   │   │   │   └── store.rs
│   │   │   ├── mod.rs
│   │   │   ├── nn/
│   │   │   │   ├── data_formats.rs
│   │   │   │   ├── gelu_approximate.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── reduce.rs
│   │   │   │   ├── rms_norm.rs
│   │   │   │   ├── silu.rs
│   │   │   │   └── softmax/
│   │   │   │       ├── fixedpoint.rs
│   │   │   │       ├── math.rs
│   │   │   │       └── mod.rs
│   │   │   ├── quant.rs
│   │   │   ├── scan/
│   │   │   │   ├── decluttered.rs
│   │   │   │   ├── mod.rs
│   │   │   │   └── optimized.rs
│   │   │   ├── source.rs
│   │   │   ├── submodel.rs
│   │   │   └── unimpl.rs
│   │   ├── optim/
│   │   │   ├── change_axes.rs
│   │   │   ├── concat_then_einsum.rs
│   │   │   ├── mod.rs
│   │   │   ├── op_optim.rs
│   │   │   ├── prop_const.rs
│   │   │   ├── propagate_roi.rs
│   │   │   ├── push_split_down.rs
│   │   │   ├── slice.rs
│   │   │   └── uniform_mask.rs
│   │   ├── plan.rs
│   │   ├── runtime.rs
│   │   ├── transform.rs
│   │   └── value.rs
│   └── test_data/
│       └── test_data.cfg
├── cuda/
│   ├── Cargo.toml
│   ├── benches/
│   │   └── cuda_flash.rs
│   └── src/
│       ├── context.rs
│       ├── kernels/
│       │   ├── array/
│       │   │   ├── cast.rs
│       │   │   ├── copy.rs
│       │   │   ├── dispatch.rs
│       │   │   ├── mod.rs
│       │   │   └── rotate_half.rs
│       │   ├── binary.rs
│       │   ├── conv.rs
│       │   ├── conv_cudnn.rs
│       │   ├── cu/
│       │   │   ├── array.cu
│       │   │   ├── binary.cu
│       │   │   ├── cnn.cu
│       │   │   ├── common.cuh
│       │   │   ├── element_wise.cu
│       │   │   ├── flash_attn.cu
│       │   │   ├── ggml_flash_attn.cu
│       │   │   ├── mm_mv.cu
│       │   │   ├── mm_mv_q.cu
│       │   │   ├── nn.cu
│       │   │   └── quantize.cu
│       │   ├── element_wise.rs
│       │   ├── flash_attn.rs
│       │   ├── ggml_flash_attn.rs
│       │   ├── iff.rs
│       │   ├── launch_args.rs
│       │   ├── matmul/
│       │   │   ├── mod.rs
│       │   │   └── quant_act_q81.rs
│       │   ├── mod.rs
│       │   ├── nn/
│       │   │   ├── apply_rope.rs
│       │   │   ├── gelu_approximate.rs
│       │   │   ├── leaky_relu.rs
│       │   │   ├── mod.rs
│       │   │   ├── reduce.rs
│       │   │   ├── rms_norm.rs
│       │   │   ├── scaled_masked_softmax.rs
│       │   │   └── softmax.rs
│       │   └── utils.rs
│       ├── lib.rs
│       ├── ops/
│       │   ├── conv.rs
│       │   ├── flash_attn.rs
│       │   ├── fused_axis_op.rs
│       │   ├── gemm.rs
│       │   ├── ggml_flash_attn.rs
│       │   ├── iff.rs
│       │   ├── mod.rs
│       │   └── quant_q81.rs
│       ├── rewrite_rules/
│       │   ├── add_matmul_broadcast.rs
│       │   ├── fuse_axis_op.rs
│       │   ├── mod.rs
│       │   ├── pad_q40_weights.rs
│       │   └── untranspose_matmul_output.rs
│       ├── tensor.rs
│       ├── transform.rs
│       └── utils.rs
├── data/
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   ├── benches/
│   │   ├── stack_tensors.rs
│   │   └── tensor_from_datum.rs
│   └── src/
│       ├── blob.rs
│       ├── datum.rs
│       ├── dim/
│       │   ├── assertion.rs
│       │   ├── mod.rs
│       │   ├── parse.rs
│       │   ├── resolve.rs
│       │   ├── sym.rs
│       │   └── tree.rs
│       ├── exotic.rs
│       ├── lib.rs
│       ├── macros.rs
│       ├── scatter.rs
│       ├── tensor/
│       │   ├── litteral.rs
│       │   ├── plain_view.rs
│       │   ├── storage.rs
│       │   └── view.rs
│       └── tensor.rs
├── deny.toml
├── doc/
│   ├── README.md
│   ├── cli-recipe.md
│   ├── graph.md
│   ├── intro.md
│   ├── kernel-notes.md
│   ├── nnef/
│   │   ├── tract-core.nnef
│   │   ├── tract-onnx.nnef
│   │   ├── tract-pulse.nnef
│   │   └── tract-resource.nnef
│   └── op.md
├── examples/
│   ├── .gitignore
│   ├── causal_llm/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── ci.sh
│   │   ├── scripts/
│   │   │   └── generate_ci_llm_assets.sh
│   │   └── src/
│   │       ├── bin/
│   │       │   ├── client.rs
│   │       │   ├── common/
│   │       │   │   └── mod.rs
│   │       │   ├── complete.rs
│   │       │   └── serve.rs
│   │       └── lib.rs
│   ├── face_detection_yolov8onnx_example/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── ci.sh
│   │   └── src/
│   │       └── main.rs
│   ├── face_similarity_arcface_onnx/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── ci.sh
│   │   └── src/
│   │       ├── arc_face.rs
│   │       ├── main.rs
│   │       └── yolo_face.rs
│   ├── keras-tract-tf2/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── ci.sh.nope
│   │   ├── example.py
│   │   ├── io.npz
│   │   ├── requirements.txt
│   │   └── src/
│   │       └── main.rs
│   ├── nemo-nemotron-asr/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── ci.sh
│   │   ├── nemotron.py
│   │   └── src/
│   │       └── main.rs
│   ├── nemo-parakeet-asr/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── ci.sh
│   │   ├── parakeet.py
│   │   └── src/
│   │       └── main.rs
│   ├── nnef-dump-mobilenet-v2/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── ci.sh
│   │   └── src/
│   │       └── main.rs
│   ├── nnef-mobilenet-v2/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── ci.sh
│   │   ├── imagenet_slim_labels.txt
│   │   └── src/
│   │       └── main.rs
│   ├── nnef-mobilenet-v2-api/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── ci.sh
│   │   ├── imagenet_slim_labels.txt
│   │   └── src/
│   │       └── main.rs
│   ├── onnx-mobilenet-v2/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── ci.sh
│   │   ├── imagenet_slim_labels.txt
│   │   └── src/
│   │       ├── bin/
│   │       │   └── dyn-shape.rs
│   │       └── main.rs
│   ├── pytorch-albert-v2/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── ci.sh
│   │   ├── export.py
│   │   └── src/
│   │       └── main.rs
│   ├── pytorch-resnet/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── ci.sh
│   │   ├── export.py
│   │   ├── requirements.txt
│   │   └── src/
│   │       └── main.rs
│   ├── stable-diffusion/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── ci-gpu.sh
│   │   ├── export.py
│   │   ├── reference.py
│   │   └── src/
│   │       └── main.rs
│   ├── stable-diffusion-3/
│   │   ├── Cargo.toml
│   │   ├── export.py
│   │   ├── reference.py
│   │   ├── runme.sh
│   │   └── src/
│   │       └── main.rs
│   ├── stable-diffusion-xl/
│   │   ├── Cargo.toml
│   │   ├── ci-gpu.sh
│   │   ├── export.py
│   │   ├── reference.py
│   │   └── src/
│   │       └── main.rs
│   ├── tensorflow-mobilenet-v2/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── ci.sh
│   │   ├── imagenet_slim_labels.txt
│   │   └── src/
│   │       └── main.rs
│   └── tflite-mobilenet-v3/
│       ├── .gitignore
│       ├── Cargo.toml
│       ├── README.md
│       ├── ci.sh
│       ├── imagenet_slim_labels.txt
│       └── src/
│           └── main.rs
├── extra/
│   ├── Cargo.toml
│   └── src/
│       ├── exp_unit_norm.rs
│       └── lib.rs
├── gpu/
│   ├── Cargo.toml
│   └── src/
│       ├── device.rs
│       ├── fact.rs
│       ├── lib.rs
│       ├── memory/
│       │   ├── mod.rs
│       │   ├── pool.rs
│       │   └── schema.rs
│       ├── ops/
│       │   ├── RECIPE.md
│       │   ├── apply_rope.rs
│       │   ├── binary.rs
│       │   ├── broadcast.rs
│       │   ├── cast.rs
│       │   ├── change_axes.rs
│       │   ├── concat.rs
│       │   ├── copy_based.rs
│       │   ├── dyn_kv_cache.rs
│       │   ├── element_wise.rs
│       │   ├── gelu_approximate.rs
│       │   ├── iff.rs
│       │   ├── leaky_relu.rs
│       │   ├── mod.rs
│       │   ├── pulse.rs
│       │   ├── reduce.rs
│       │   ├── rms_norm.rs
│       │   ├── rotate_half.rs
│       │   ├── scaled_masked_softmax.rs
│       │   ├── slice.rs
│       │   └── softmax.rs
│       ├── rewrite_rules/
│       │   ├── mod.rs
│       │   ├── rewire_sdpa.rs
│       │   ├── rewire_syncs.rs
│       │   └── rms_norm.rs
│       ├── session_handler.rs
│       ├── sync.rs
│       ├── tensor/
│       │   ├── arena_view.rs
│       │   ├── mod.rs
│       │   └── owned.rs
│       └── utils.rs
├── harness/
│   ├── core-proptest-pulse/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       ├── conv_plus_conv.rs
│   │       ├── deconv.rs
│   │       ├── delay_plus_downsample.rs
│   │       ├── delay_plus_pool.rs
│   │       ├── einsum.rs
│   │       ├── lib.rs
│   │       └── pad_plus_conv.rs
│   ├── nemotron-speech-streaming-en-0.6b/
│   │   └── ci.sh
│   ├── nnef-inceptionv3/
│   │   ├── Cargo.toml
│   │   ├── download.sh
│   │   └── src/
│   │       └── lib.rs
│   ├── nnef-test-cases/
│   │   ├── .gitignore
│   │   ├── conv-bias/
│   │   │   ├── expected
│   │   │   ├── graph.nnef
│   │   │   └── runme.sh
│   │   ├── conv-q40/
│   │   │   ├── conv2d/
│   │   │   │   ├── io.npz
│   │   │   │   ├── model.nnef.tgz
│   │   │   │   └── runme.sh
│   │   │   ├── conv_base_kernel1/
│   │   │   │   ├── io.npz
│   │   │   │   ├── model.nnef.tgz
│   │   │   │   └── runme.sh
│   │   │   ├── conv_base_kernel3/
│   │   │   │   ├── io.npz
│   │   │   │   ├── model.nnef.tgz
│   │   │   │   └── runme.sh
│   │   │   ├── conv_base_kernel9/
│   │   │   │   ├── io.npz
│   │   │   │   ├── model.nnef.tgz
│   │   │   │   └── runme.sh
│   │   │   ├── conv_dilation2/
│   │   │   │   ├── io.npz
│   │   │   │   ├── model.nnef.tgz
│   │   │   │   └── runme.sh
│   │   │   ├── conv_dilation4/
│   │   │   │   ├── io.npz
│   │   │   │   ├── model.nnef.tgz
│   │   │   │   └── runme.sh
│   │   │   ├── conv_dilation8/
│   │   │   │   ├── io.npz
│   │   │   │   ├── model.nnef.tgz
│   │   │   │   └── runme.sh
│   │   │   ├── conv_groups2/
│   │   │   │   ├── io.npz
│   │   │   │   ├── model.nnef.tgz
│   │   │   │   └── runme.sh
│   │   │   ├── conv_groups4/
│   │   │   │   ├── io.npz
│   │   │   │   ├── model.nnef.tgz
│   │   │   │   └── runme.sh
│   │   │   ├── conv_insize128/
│   │   │   │   ├── io.npz
│   │   │   │   ├── model.nnef.tgz
│   │   │   │   └── runme.sh
│   │   │   ├── conv_insize64/
│   │   │   │   ├── io.npz
│   │   │   │   ├── model.nnef.tgz
│   │   │   │   └── runme.sh
│   │   │   ├── conv_stride2/
│   │   │   │   ├── io.npz
│   │   │   │   ├── model.nnef.tgz
│   │   │   │   └── runme.sh
│   │   │   └── conv_stride3/
│   │   │       ├── io.npz
│   │   │       ├── model.nnef.tgz
│   │   │       └── runme.sh
│   │   ├── conv-with-batch/
│   │   │   ├── graph.nnef
│   │   │   └── runme.sh
│   │   ├── debox/
│   │   │   ├── debox_base/
│   │   │   │   ├── graph.nnef
│   │   │   │   ├── io.npz
│   │   │   │   └── runme.sh
│   │   │   └── debox_high_dim/
│   │   │       ├── graph.nnef
│   │   │       ├── io.npz
│   │   │       └── runme.sh
│   │   ├── dyn_slice/
│   │   │   ├── graph.nnef
│   │   │   └── runme.sh
│   │   ├── fixed_roll/
│   │   │   ├── graph.nnef
│   │   │   ├── io.npz
│   │   │   └── runme.sh
│   │   ├── memory-arena/
│   │   │   ├── expected.json
│   │   │   └── runme.sh
│   │   ├── pool-padding/
│   │   │   ├── graph.nnef
│   │   │   └── runme.sh
│   │   ├── q40_linear_followed_slice/
│   │   │   ├── graph.nnef
│   │   │   └── runme.sh
│   │   ├── qmul/
│   │   │   ├── graph.nnef
│   │   │   ├── graph.quant
│   │   │   ├── io.npz
│   │   │   └── runme.sh
│   │   ├── range-slice-dyn-tile/
│   │   │   ├── model.nnef.tgz
│   │   │   └── runme.sh
│   │   ├── reshape/
│   │   │   ├── graph.nnef
│   │   │   └── runme.sh
│   │   ├── reshape_with_bc/
│   │   │   ├── graph.nnef
│   │   │   └── runme.sh
│   │   ├── sdpa/
│   │   │   ├── simple-causal-f32/
│   │   │   │   ├── graph.nnef
│   │   │   │   ├── io.npz
│   │   │   │   └── runme.sh
│   │   │   ├── simple-f16/
│   │   │   │   ├── graph.nnef
│   │   │   │   ├── io.npz
│   │   │   │   └── runme.sh
│   │   │   ├── simple-grouped-query-att-f32/
│   │   │   │   ├── io.npz
│   │   │   │   ├── model.nnef.tgz
│   │   │   │   └── runme.sh
│   │   │   ├── simple-mask-f32/
│   │   │   │   ├── graph.nnef
│   │   │   │   ├── io.npz
│   │   │   │   └── runme.sh
│   │   │   ├── simple-non-causal-f32/
│   │   │   │   ├── graph.nnef
│   │   │   │   ├── io.npz
│   │   │   │   └── runme.sh
│   │   │   └── simple-scale-f32/
│   │   │       ├── graph.nnef
│   │   │       ├── io.npz
│   │   │       └── runme.sh
│   │   ├── slice-over-slice-optim-loop/
│   │   │   ├── graph.nnef
│   │   │   └── runme.sh
│   │   ├── softmax/
│   │   │   ├── softmax-change-axis/
│   │   │   │   ├── expected
│   │   │   │   ├── graph.nnef
│   │   │   │   └── runme.sh
│   │   │   ├── softmax-change-axis-1/
│   │   │   │   ├── expected
│   │   │   │   ├── graph.nnef
│   │   │   │   └── runme.sh
│   │   │   └── softmax-quant/
│   │   │       ├── expected/
│   │   │       │   ├── graph.nnef
│   │   │       │   └── graph.quant
│   │   │       ├── model/
│   │   │       │   ├── graph.nnef
│   │   │       │   └── graph.quant
│   │   │       └── runme.sh
│   │   ├── submodel/
│   │   │   ├── expected
│   │   │   ├── graph.nnef
│   │   │   ├── graph.quant
│   │   │   ├── nnet2/
│   │   │   │   ├── graph.nnef
│   │   │   │   └── graph.quant
│   │   │   └── runme.sh
│   │   ├── tdim-cmp/
│   │   │   ├── graph.nnef
│   │   │   └── runme.sh
│   │   ├── test_all_reduce/
│   │   │   ├── io.npz
│   │   │   ├── model.nnef.tgz
│   │   │   └── runme.sh
│   │   ├── test_any_reduce/
│   │   │   ├── io.npz
│   │   │   ├── model.nnef.tgz
│   │   │   └── runme.sh
│   │   ├── test_manage_gru_states/
│   │   │   ├── io.npz
│   │   │   ├── model.nnef.tgz
│   │   │   └── runme.sh
│   │   ├── test_stft_smaller_win/
│   │   │   ├── io.npz
│   │   │   ├── model.nnef.tgz
│   │   │   └── runme.sh
│   │   ├── test_upcast_f32_attn/
│   │   │   ├── io.npz
│   │   │   ├── model.nnef.tgz
│   │   │   └── runme.sh
│   │   ├── tile-with-tdim/
│   │   │   ├── graph.nnef
│   │   │   └── runme.sh
│   │   ├── uniform-mul/
│   │   │   ├── expected
│   │   │   ├── graph.nnef
│   │   │   ├── io.npz
│   │   │   └── runme.sh
│   │   └── variable-in-fragment/
│   │       ├── graph.nnef
│   │       └── runme.sh
│   ├── parakeet-tdt-600m-v3/
│   │   └── ci.sh
│   ├── pre-optimized-graphes/
│   │   ├── .gitignore
│   │   ├── hey_snips_v4_model17/
│   │   │   ├── expected
│   │   │   └── runme.sh
│   │   └── mdl-en-2019-Q3-librispeech/
│   │       ├── expected
│   │       └── runme.sh
│   ├── tf-inceptionv3/
│   │   ├── Cargo.toml
│   │   ├── benches/
│   │   │   └── inceptionv3.rs
│   │   ├── download.sh
│   │   └── src/
│   │       └── lib.rs
│   ├── tf-mobilenet-v2/
│   │   ├── Cargo.toml
│   │   ├── download.sh
│   │   └── src/
│   │       └── lib.rs
│   └── tfl-mobilenet-v2-q/
│       ├── Cargo.toml
│       ├── download.sh
│       └── src/
│           └── lib.rs
├── hir/
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   └── src/
│       ├── framework.rs
│       ├── infer/
│       │   ├── analyser.rs
│       │   ├── fact.rs
│       │   ├── factoid.rs
│       │   ├── helpers.rs
│       │   ├── mod.rs
│       │   ├── model.rs
│       │   ├── ops.rs
│       │   ├── optim.rs
│       │   └── rules/
│       │       ├── cache.rs
│       │       ├── expr.rs
│       │       ├── mod.rs
│       │       ├── path.rs
│       │       ├── proxies.rs
│       │       └── solver.rs
│       ├── lib.rs
│       ├── macros.rs
│       └── ops/
│           ├── activations.rs
│           ├── array/
│           │   ├── add_dims.rs
│           │   ├── array_feature_extractor.rs
│           │   ├── broadcast.rs
│           │   ├── concat.rs
│           │   ├── constant_like.rs
│           │   ├── constant_of_shape.rs
│           │   ├── crop.rs
│           │   ├── dyn_slice.rs
│           │   ├── flatten.rs
│           │   ├── gather.rs
│           │   ├── gather_elements.rs
│           │   ├── gather_nd.rs
│           │   ├── mod.rs
│           │   ├── pad.rs
│           │   ├── permute_axes.rs
│           │   ├── range.rs
│           │   ├── reshape.rs
│           │   ├── rm_dims.rs
│           │   ├── scatter_elements.rs
│           │   ├── scatter_nd.rs
│           │   ├── shape.rs
│           │   ├── size.rs
│           │   ├── slice.rs
│           │   ├── split.rs
│           │   ├── squeeze.rs
│           │   ├── strided_slice.rs
│           │   └── tile.rs
│           ├── binary.rs
│           ├── cast.rs
│           ├── cnn/
│           │   ├── conv.rs
│           │   ├── mod.rs
│           │   └── pools.rs
│           ├── downsample.rs
│           ├── dummy.rs
│           ├── element_wise.rs
│           ├── expandable.rs
│           ├── identity.rs
│           ├── konst.rs
│           ├── logic.rs
│           ├── matmul.rs
│           ├── mod.rs
│           ├── nn/
│           │   ├── global_pools.rs
│           │   ├── layer_max.rs
│           │   ├── mod.rs
│           │   ├── reduce.rs
│           │   └── softmax.rs
│           ├── quant.rs
│           ├── scan.rs
│           ├── source.rs
│           └── unimpl.rs
├── libcli/
│   ├── Cargo.toml
│   ├── src/
│   │   ├── annotations.rs
│   │   ├── display_params.rs
│   │   ├── draw.rs
│   │   ├── export.rs
│   │   ├── lib.rs
│   │   ├── model.rs
│   │   ├── profile.rs
│   │   ├── tensor.rs
│   │   ├── terminal.rs
│   │   └── time.rs
│   └── validate_wires.py
├── linalg/
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   ├── README.md
│   ├── arm32/
│   │   ├── armv7neon/
│   │   │   ├── armv7neon_mmm_f32_32x1_core.tmpl
│   │   │   ├── armv7neon_mmm_f32_8x1_core.tmpl
│   │   │   ├── armv7neon_mmm_f32_8x4_core.tmpl
│   │   │   ├── armv7neon_mmm_f32_8x6_core.tmpl
│   │   │   ├── armv7neon_mmm_f32_per_cols.tmpliq
│   │   │   ├── armv7neon_mmm_f32_per_rows.tmpliq
│   │   │   ├── armv7neon_mmm_f32_scalars.tmpliq
│   │   │   ├── armv7neon_mmm_i32_32x1.tmpl
│   │   │   ├── armv7neon_mmm_i32_8x4.tmpl
│   │   │   ├── armv7neon_mmm_i32_per_cols.tmpliq
│   │   │   ├── armv7neon_mmm_i32_per_rows.tmpliq
│   │   │   ├── armv7neon_mmm_i32_scalars.tmpliq
│   │   │   ├── armv7neon_mmm_i32_scale_q8_q15.tmpliq
│   │   │   ├── armv7neon_mmm_q_per_col.tmpliq
│   │   │   ├── armv7neon_mmm_q_per_row.tmpliq
│   │   │   ├── armv7neon_mmm_q_scalar.tmpliq
│   │   │   ├── armv7neon_prefetch.tmpl
│   │   │   ├── armv7neon_sigmoid_f32_4n.tmpl
│   │   │   ├── armv7neon_tanh_f32_4n.tmpl
│   │   │   └── dispatcher.tmpliq
│   │   └── armvfpv2/
│   │       ├── armvfpv2_mmm_f32_4x4.tmpl
│   │       └── dispatcher.tmpliq
│   ├── arm64/
│   │   ├── apple_amx/
│   │   │   ├── apple_amx_mmm_f16_64x1.tmpl
│   │   │   ├── apple_amx_mmm_f16_64x32.tmpl
│   │   │   ├── apple_amx_mmm_f32_32x1.tmpl
│   │   │   ├── apple_amx_mmm_f32_32x32.tmpl
│   │   │   ├── dispatcher.tmpliq
│   │   │   └── instructions.rs
│   │   ├── arm64fp16/
│   │   │   ├── arm64fp16_leaky_relu_f16_8n.tmpl
│   │   │   ├── arm64fp16_mmm_8h_per_col.tmpliq
│   │   │   ├── arm64fp16_mmm_8h_per_row.tmpliq
│   │   │   ├── arm64fp16_mmm_8h_scalar.tmpliq
│   │   │   ├── arm64fp16_mmm_f16_128x1/
│   │   │   │   ├── loop1/
│   │   │   │   │   ├── cortex_a53.tmpli
│   │   │   │   │   └── naive.tmpli
│   │   │   │   └── loop2/
│   │   │   │       └── cortex_a55.tmpli
│   │   │   ├── arm64fp16_mmm_f16_128x1_core.tmpl
│   │   │   ├── arm64fp16_mmm_f16_16x8/
│   │   │   │   ├── loop1/
│   │   │   │   │   └── naive.tmpli
│   │   │   │   └── loop2/
│   │   │   │       └── cortex_a55.tmpli
│   │   │   ├── arm64fp16_mmm_f16_16x8_core.tmpl
│   │   │   ├── arm64fp16_mmm_f16_32x4/
│   │   │   │   ├── loop1/
│   │   │   │   │   └── naive.tmpli
│   │   │   │   └── loop2/
│   │   │   │       └── cortex_a55.tmpli
│   │   │   ├── arm64fp16_mmm_f16_32x4_core.tmpl
│   │   │   ├── arm64fp16_mmm_f16_32x6.core.tmpl
│   │   │   ├── arm64fp16_mmm_f16_64x1.core.tmpl
│   │   │   ├── arm64fp16_mmm_f16_64x3.core.tmpl
│   │   │   ├── arm64fp16_mmm_f16_per_cols.tmpliq
│   │   │   ├── arm64fp16_mmm_f16_per_rows.tmpliq
│   │   │   ├── arm64fp16_mmm_f16_scalars.tmpliq
│   │   │   ├── arm64fp16_mmm_load_tile.tmpliq
│   │   │   ├── arm64fp16_sigmoid_f16_8n.tmpl
│   │   │   ├── arm64fp16_tanh_f16_8n.tmpl
│   │   │   ├── dispatcher.tmpliq
│   │   │   ├── dummy_fmla_no_pragma.S
│   │   │   └── dummy_fmla_pragma.S
│   │   └── arm64simd/
│   │       ├── arm64simd_mmm_4s_per_col.tmpliq
│   │       ├── arm64simd_mmm_4s_per_row.tmpliq
│   │       ├── arm64simd_mmm_4s_scalar.tmpliq
│   │       ├── arm64simd_mmm_f32_12x8/
│   │       │   ├── packed_packed_loop1/
│   │       │   │   ├── ldr_w_no_preload.tmpli
│   │       │   │   ├── ldr_w_preload.tmpli
│   │       │   │   ├── ldr_x_preload.tmpli
│   │       │   │   └── naive.tmpli
│   │       │   └── packed_packed_loop2/
│   │       │       └── cortex_a55.tmpli
│   │       ├── arm64simd_mmm_f32_12x8_core.tmpl
│   │       ├── arm64simd_mmm_f32_16x4/
│   │       │   ├── packed_packed_loop1/
│   │       │   │   ├── cortex_a53.tmpli
│   │       │   │   └── naive.tmpli
│   │       │   └── packed_packed_loop2/
│   │       │       └── cortex_a55.tmpli
│   │       ├── arm64simd_mmm_f32_16x4_core.tmpl
│   │       ├── arm64simd_mmm_f32_24x4/
│   │       │   ├── loop2/
│   │       │   │   └── cortex_a55.tmpli
│   │       │   └── packed_packed_loop1/
│   │       │       ├── cortex_a53.tmpli
│   │       │       ├── cortex_a55.tmpli
│   │       │       └── naive.tmpli
│   │       ├── arm64simd_mmm_f32_24x4_core.tmpl
│   │       ├── arm64simd_mmm_f32_32x1_core.tmpl
│   │       ├── arm64simd_mmm_f32_32x3_core.tmpl
│   │       ├── arm64simd_mmm_f32_64x1/
│   │       │   ├── loop1/
│   │       │   │   ├── cortex_a53.tmpli
│   │       │   │   └── naive.tmpli
│   │       │   └── loop2/
│   │       │       ├── cortex_a55.tmpli
│   │       │       └── naive.tmpli
│   │       ├── arm64simd_mmm_f32_64x1_core.tmpl
│   │       ├── arm64simd_mmm_f32_8x8/
│   │       │   ├── packed_packed_loop1/
│   │       │   │   ├── broken_chains.tmpli
│   │       │   │   ├── ldr_w_no_preload.tmpli
│   │       │   │   ├── ldr_w_preload.tmpli
│   │       │   │   ├── ldr_x_no_preload.tmpli
│   │       │   │   ├── ldr_x_preload.tmpli
│   │       │   │   └── naive.tmpli
│   │       │   └── packed_packed_loop2/
│   │       │       ├── broken_chains.tmpli
│   │       │       └── cortex_a55.tmpli
│   │       ├── arm64simd_mmm_f32_8x8_core.tmpl
│   │       ├── arm64simd_mmm_f32_per_cols.tmpliq
│   │       ├── arm64simd_mmm_f32_per_rows.tmpliq
│   │       ├── arm64simd_mmm_f32_scalars.tmpliq
│   │       ├── arm64simd_mmm_i32_64x1.tmpl
│   │       ├── arm64simd_mmm_i32_8x8.tmpl
│   │       ├── arm64simd_mmm_i32_per_cols.tmpliq
│   │       ├── arm64simd_mmm_i32_per_rows.tmpliq
│   │       ├── arm64simd_mmm_i32_scalars.tmpliq
│   │       ├── arm64simd_mmm_i32_scale_q16_q31.tmpliq
│   │       ├── arm64simd_mmm_load_tile.tmpliq
│   │       ├── arm64simd_sigmoid_f32_4n.tmpl
│   │       ├── arm64simd_tanh_f32_4n.tmpl
│   │       └── dispatcher.tmpliq
│   ├── benches/
│   │   ├── arm32neon.rs
│   │   ├── arm64.rs
│   │   ├── arm64simd.rs
│   │   ├── intel.rs
│   │   ├── leaky_relu.rs
│   │   ├── mat_vec.rs
│   │   ├── mm_for_asr_am.rs
│   │   ├── mm_for_inception.rs
│   │   ├── mm_for_wavenet_hw.rs
│   │   ├── sigmoid.rs
│   │   ├── softmax.rs
│   │   ├── utils.rs
│   │   ├── virtual_im2col.rs
│   │   └── x86_64.rs
│   ├── build.rs
│   ├── cost_model/
│   │   ├── Cargo.toml
│   │   ├── src/
│   │   │   └── main.rs
│   │   └── train/
│   │       ├── README.md
│   │       ├── requirements.txt
│   │       ├── runme.sh
│   │       └── train.py
│   ├── matmul-bench/
│   │   ├── Cargo.toml
│   │   ├── benches/
│   │   │   └── matmul.rs
│   │   ├── build.rs
│   │   ├── c/
│   │   │   ├── packed_tile_4x4.c
│   │   │   ├── packed_tile_8x8.c
│   │   │   ├── tile_1x1.c
│   │   │   ├── tile_2x2.c
│   │   │   ├── tile_4x4.c
│   │   │   └── tile_8x8.c
│   │   └── src/
│   │       └── lib.rs
│   ├── src/
│   │   ├── arm32/
│   │   │   ├── armv7neon.rs
│   │   │   ├── armvfpv2.rs
│   │   │   ├── cortex_a7.rs
│   │   │   ├── cortex_a7.txt
│   │   │   ├── cortex_a9.rs
│   │   │   └── cortex_a9.txt
│   │   ├── arm32.rs
│   │   ├── arm64/
│   │   │   ├── apple_amx.rs
│   │   │   ├── arm64fp16/
│   │   │   │   ├── by_scalar.rs
│   │   │   │   ├── leaky_relu.rs
│   │   │   │   ├── max.rs
│   │   │   │   ├── panel_extract.rs
│   │   │   │   ├── sum.rs
│   │   │   │   └── unicast.rs
│   │   │   ├── arm64fp16.rs
│   │   │   ├── arm64simd/
│   │   │   │   ├── by_scalar.rs
│   │   │   │   ├── leaky_relu.rs
│   │   │   │   ├── max.rs
│   │   │   │   ├── panel_extract.rs
│   │   │   │   ├── softmax.rs
│   │   │   │   ├── sum.rs
│   │   │   │   └── unicast.rs
│   │   │   ├── arm64simd.rs
│   │   │   ├── cortex_a53.rs
│   │   │   ├── cortex_a55.rs
│   │   │   ├── cortex_a72.rs
│   │   │   └── cortex_a73.rs
│   │   ├── arm64.rs
│   │   ├── frame/
│   │   │   ├── block_quant/
│   │   │   │   ├── helpers.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── q4_0.rs
│   │   │   │   ├── q8_1.rs
│   │   │   │   ├── storage.rs
│   │   │   │   └── value.rs
│   │   │   ├── by_scalar.rs
│   │   │   ├── element_wise.rs
│   │   │   ├── element_wise_helper.rs
│   │   │   ├── leaky_relu.rs
│   │   │   ├── lut.rs
│   │   │   ├── mmm/
│   │   │   │   ├── cost_model.rs
│   │   │   │   ├── fuse.rs
│   │   │   │   ├── input_store.rs
│   │   │   │   ├── kernel.rs
│   │   │   │   ├── macros.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── panel_extract.rs
│   │   │   │   ├── scratch.rs
│   │   │   │   ├── storage.rs
│   │   │   │   └── tests/
│   │   │   │       ├── frame.rs
│   │   │   │       ├── fuse.rs
│   │   │   │       ├── mod.rs
│   │   │   │       ├── packed_packed.rs
│   │   │   │       ├── q_scale.rs
│   │   │   │       └── store.rs
│   │   │   ├── mod.rs
│   │   │   ├── pack.rs
│   │   │   ├── reduce/
│   │   │   │   ├── max.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── softmax.rs
│   │   │   │   └── sum.rs
│   │   │   ├── sigmoid.rs
│   │   │   ├── tanh.rs
│   │   │   ├── unicast.rs
│   │   │   └── weights.rs
│   │   ├── generic/
│   │   │   ├── by_scalar.rs
│   │   │   ├── erf.rs
│   │   │   ├── leaky_relu.rs
│   │   │   ├── lut.rs
│   │   │   ├── mmm.rs
│   │   │   ├── reduce.rs
│   │   │   ├── rounding.rs
│   │   │   ├── sigmoid.rs
│   │   │   ├── tanh.rs
│   │   │   └── unicast.rs
│   │   ├── generic.rs
│   │   ├── hwbench/
│   │   │   ├── bandwidth.rs
│   │   │   ├── mod.rs
│   │   │   └── runner.rs
│   │   ├── lib.rs
│   │   ├── multithread.rs
│   │   ├── wasm.rs
│   │   ├── x86_64_fma/
│   │   │   ├── by_scalar.rs
│   │   │   ├── intel.rs
│   │   │   ├── max.rs
│   │   │   ├── mmm.rs
│   │   │   ├── panel_extract.rs
│   │   │   └── softmax.rs
│   │   └── x86_64_fma.rs
│   ├── tests/
│   │   └── virtual_im2col.rs
│   └── x86_64/
│       ├── avx512/
│       │   ├── 10x1/
│       │   │   └── packed_packed_loop1/
│       │   │       ├── avx-512-unroll.tmpli
│       │   │       └── avx-512.tmpli
│       │   ├── 1x1/
│       │   │   └── packed_packed_loop1/
│       │   │       ├── avx-512.tmpli
│       │   │       ├── unroll-16.tmpli
│       │   │       ├── unroll-4.tmpli
│       │   │       ├── unroll-8.tmpli
│       │   │       └── unroll.tmpli
│       │   ├── 1x12/
│       │   │   └── packed_packed_loop1/
│       │   │       └── avx-512.tmpli
│       │   ├── 2x5/
│       │   │   └── packed_packed_loop1/
│       │   │       ├── avx-512-unroll.tmpli
│       │   │       └── avx-512.tmpli
│       │   ├── 2x6/
│       │   │   └── packed_packed_loop1/
│       │   │       ├── avx-512-unroll.tmpli
│       │   │       └── avx-512.tmpli
│       │   ├── 3x4/
│       │   │   └── packed_packed_loop1/
│       │   │       ├── avx-512-unroll.tmpli
│       │   │       └── avx-512.tmpli
│       │   ├── 4x3/
│       │   │   └── packed_packed_loop1/
│       │   │       ├── avx-512-unroll.tmpli
│       │   │       └── avx-512.tmpli
│       │   ├── 5x2/
│       │   │   └── packed_packed_loop1/
│       │   │       ├── avx-512-unroll.tmpli
│       │   │       └── avx-512.tmpli
│       │   ├── 6x1/
│       │   │   └── packed_packed_loop1/
│       │   │       ├── avx-512-unroll.tmpli
│       │   │       └── avx-512.tmpli
│       │   ├── 6x2/
│       │   │   └── packed_packed_loop1/
│       │   │       ├── avx-512-unroll.tmpli
│       │   │       └── avx-512.tmpli
│       │   ├── 7x1/
│       │   │   └── packed_packed_loop1/
│       │   │       ├── avx-512-unroll.tmpli
│       │   │       └── avx-512.tmpli
│       │   ├── 8x1/
│       │   │   └── packed_packed_loop1/
│       │   │       ├── avx-512-unroll.tmpli
│       │   │       └── avx-512.tmpli
│       │   ├── 8x2/
│       │   │   └── packed_packed_loop1/
│       │   │       └── avx-512.tmpli
│       │   ├── 8x8/
│       │   │   └── packed_packed_loop1/
│       │   │       ├── avx-512-unroll.tmpli
│       │   │       └── avx-512.tmpli
│       │   ├── avx512_mmm_f32_128x1.tmpl
│       │   ├── avx512_mmm_f32_16x1.tmpl
│       │   ├── avx512_mmm_f32_16x12.tmpl
│       │   ├── avx512_mmm_f32_16x8.tmpl
│       │   ├── avx512_mmm_f32_32x5.tmpl
│       │   ├── avx512_mmm_f32_32x6.tmpl
│       │   ├── avx512_mmm_f32_48x4.tmpl
│       │   ├── avx512_mmm_f32_64x3.tmpl
│       │   ├── avx512_mmm_f32_80x2.tmpl
│       │   ├── avx512_mmm_load_tile.tmpliq
│       │   ├── dispatcher.tmpliq
│       │   ├── f32_per_cols.tmpliq
│       │   ├── f32_per_rows.tmpliq
│       │   ├── f32_scalars.tmpliq
│       │   ├── i32_per_cols.tmpliq
│       │   ├── i32_per_rows.tmpliq
│       │   ├── i32_scalars.tmpliq
│       │   ├── postamble.tmpliq
│       │   ├── preamble.tmpliq
│       │   ├── sigmoid_f32.tmpl
│       │   ├── tanh_f32.tmpl
│       │   ├── zmm_per_col.tmpliq
│       │   ├── zmm_per_row.tmpliq
│       │   └── zmm_scalar.tmpliq
│       └── fma/
│           ├── 10x1/
│           │   └── packed_packed_loop1/
│           │       ├── avx-unroll.tmpli
│           │       └── avx.tmpli
│           ├── 2x5/
│           │   └── packed_packed_loop1/
│           │       ├── avx-unroll.tmpli
│           │       └── avx.tmpli
│           ├── 2x6/
│           │   └── packed_packed_loop1/
│           │       ├── original-unroll.tmpli
│           │       └── original.tmpli
│           ├── 3x4/
│           │   └── packed_packed_loop1/
│           │       ├── avx-unroll.tmpli
│           │       └── avx.tmpli
│           ├── 4x3/
│           │   └── packed_packed_loop1/
│           │       ├── avx-unroll.tmpli
│           │       └── avx.tmpli
│           ├── 5x2/
│           │   └── packed_packed_loop1/
│           │       ├── avx-unroll.tmpli
│           │       └── avx.tmpli
│           ├── 6x1/
│           │   └── packed_packed_loop1/
│           │       ├── avx-unroll.tmpli
│           │       └── avx.tmpli
│           ├── 6x2/
│           │   └── packed_packed_loop1/
│           │       ├── avx-unroll.tmpli
│           │       └── avx.tmpli
│           ├── 7x1/
│           │   └── packed_packed_loop1/
│           │       ├── avx-unroll.tmpli
│           │       └── avx.tmpli
│           ├── 8x1/
│           │   └── packed_packed_loop1/
│           │       ├── avx-unroll.tmpli
│           │       └── avx.tmpli
│           ├── 8x8/
│           │   └── packed_packed_loop1/
│           │       ├── avx-unroll.tmpli
│           │       └── avx.tmpli
│           ├── avx2_mmm_i32_8x8.tmpl
│           ├── dispatcher.tmpliq
│           ├── fma_mmm_f32_16x5.tmpl
│           ├── fma_mmm_f32_16x6.tmpl
│           ├── fma_mmm_f32_24x4.tmpl
│           ├── fma_mmm_f32_32x1.tmpl
│           ├── fma_mmm_f32_32x3.tmpl
│           ├── fma_mmm_f32_40x2.tmpl
│           ├── fma_mmm_f32_64x1.tmpl
│           ├── fma_mmm_f32_8x8.tmpl
│           ├── fma_mmm_f32_per_cols.tmpliq
│           ├── fma_mmm_f32_per_rows.tmpliq
│           ├── fma_mmm_f32_scalars.tmpliq
│           ├── fma_mmm_i32_per_cols.tmpliq
│           ├── fma_mmm_i32_per_rows.tmpliq
│           ├── fma_mmm_i32_scalars.tmpliq
│           ├── fma_mmm_load_tile.tmpliq
│           ├── fma_mmm_ymm_per_col.tmpliq
│           ├── fma_mmm_ymm_per_row.tmpliq
│           ├── fma_mmm_ymm_scalar.tmpliq
│           ├── fma_sigmoid_f32.tmpl
│           ├── fma_tanh_f32.tmpl
│           ├── postamble.tmpliq
│           └── preamble.tmpliq
├── metal/
│   ├── Cargo.toml
│   ├── README.md
│   ├── benches/
│   │   └── metal_gemm.rs
│   └── src/
│       ├── command_buffer.rs
│       ├── context.rs
│       ├── encoder.rs
│       ├── func_constants.rs
│       ├── kernels/
│       │   ├── array/
│       │   │   ├── array_ops.metal
│       │   │   ├── cast.rs
│       │   │   ├── copy.rs
│       │   │   ├── dispatch.rs
│       │   │   ├── mod.rs
│       │   │   └── rotate_half.rs
│       │   ├── bin_ops.metal
│       │   ├── bin_ops.rs
│       │   ├── conv.metal
│       │   ├── conv.rs
│       │   ├── element_wise.metal
│       │   ├── element_wise.rs
│       │   ├── matmul/
│       │   │   ├── basic/
│       │   │   │   ├── basic_mat_mul.metal
│       │   │   │   └── mod.rs
│       │   │   ├── ggml_gemm/
│       │   │   │   ├── README.md
│       │   │   │   ├── ggml_mm_mv.metal
│       │   │   │   └── mod.rs
│       │   │   ├── mfa/
│       │   │   │   ├── libMetalFlashAttention-ios.metallib
│       │   │   │   ├── libMetalFlashAttention-macos.metallib
│       │   │   │   └── mod.rs
│       │   │   ├── mlx_gemm/
│       │   │   │   ├── mlx_gemm.metal
│       │   │   │   ├── mlx_gemv.metal
│       │   │   │   └── mod.rs
│       │   │   └── mod.rs
│       │   ├── mod.rs
│       │   ├── nn/
│       │   │   ├── apply_rope.rs
│       │   │   ├── gelu_approximate.rs
│       │   │   ├── leaky_relu.rs
│       │   │   ├── mod.rs
│       │   │   ├── nn_ops.metal
│       │   │   ├── reduce.rs
│       │   │   ├── rms_norm.rs
│       │   │   ├── scaled_masked_softmax.rs
│       │   │   ├── silu.rs
│       │   │   └── softmax.rs
│       │   └── utils.rs
│       ├── lib.rs
│       ├── ops/
│       │   ├── conv.rs
│       │   ├── fused_axis_op.rs
│       │   ├── gemm.rs
│       │   └── mod.rs
│       ├── rewrite_rules/
│       │   ├── add_matmul_broadcast.rs
│       │   ├── fuse_axis_op.rs
│       │   ├── mod.rs
│       │   └── untranspose_matmul_output.rs
│       ├── tensor.rs
│       ├── tests.rs
│       ├── transform.rs
│       └── utils.rs
├── nnef/
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   ├── cli/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── main.rs
│   ├── nnef-resources/
│   │   ├── Cargo.toml
│   │   ├── src/
│   │   │   ├── json_loader.rs
│   │   │   └── lib.rs
│   │   └── tests/
│   │       ├── nnef_with_json/
│   │       │   ├── graph.nnef
│   │       │   └── src_config.json
│   │       └── test_json_resource.rs
│   ├── src/
│   │   ├── ast/
│   │   │   ├── dump.rs
│   │   │   ├── dump_doc.rs
│   │   │   ├── parse.rs
│   │   │   └── quant.rs
│   │   ├── ast.rs
│   │   ├── deser.rs
│   │   ├── framework.rs
│   │   ├── lib.rs
│   │   ├── liquid.rs
│   │   ├── ops/
│   │   │   ├── core/
│   │   │   │   ├── broadcast.rs
│   │   │   │   ├── cast.rs
│   │   │   │   ├── complex.rs
│   │   │   │   ├── downsample.rs
│   │   │   │   ├── dyn_slice.rs
│   │   │   │   ├── einsum.rs
│   │   │   │   ├── fft.rs
│   │   │   │   ├── gather.rs
│   │   │   │   ├── gelu_approximate.rs
│   │   │   │   ├── is_inf.rs
│   │   │   │   ├── matmul.rs
│   │   │   │   ├── one_hot.rs
│   │   │   │   ├── qconv.rs
│   │   │   │   ├── qmatmul.rs
│   │   │   │   ├── range.rs
│   │   │   │   ├── reduce.rs
│   │   │   │   ├── rms_norm.rs
│   │   │   │   ├── scan.rs
│   │   │   │   ├── scatter.rs
│   │   │   │   ├── shape_of.rs
│   │   │   │   ├── silu.rs
│   │   │   │   ├── softmax.rs
│   │   │   │   ├── source.rs
│   │   │   │   ├── submodel.rs
│   │   │   │   ├── topk.rs
│   │   │   │   └── trilu.rs
│   │   │   ├── core.rs
│   │   │   ├── mod.rs
│   │   │   ├── nnef/
│   │   │   │   ├── deser.rs
│   │   │   │   ├── mod.rs
│   │   │   │   └── ser.rs
│   │   │   └── resource.rs
│   │   ├── registry.rs
│   │   ├── resource.rs
│   │   ├── ser.rs
│   │   ├── tensors.rs
│   │   └── transform.rs
│   ├── stdlib.nnef
│   └── tests/
│       ├── alexnet.nnef
│       └── parse.rs
├── onnx/
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   ├── benches/
│   │   ├── linear_classifier.rs
│   │   └── linear_regressor.rs
│   ├── build-proto.rs
│   ├── protos/
│   │   └── onnx/
│   │       ├── onnx-operators.proto3
│   │       ├── onnx.proto
│   │       └── onnx.proto3
│   ├── src/
│   │   ├── data_resolver.rs
│   │   ├── lib.rs
│   │   ├── model.rs
│   │   ├── ops/
│   │   │   ├── array/
│   │   │   │   ├── compress.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── nonzero.rs
│   │   │   │   ├── one_hot.rs
│   │   │   │   ├── pad.rs
│   │   │   │   ├── shape.rs
│   │   │   │   ├── slice.rs
│   │   │   │   ├── split.rs
│   │   │   │   ├── squeeze.rs
│   │   │   │   ├── topk.rs
│   │   │   │   ├── trilu.rs
│   │   │   │   └── unsqueeze.rs
│   │   │   ├── cast.rs
│   │   │   ├── cumsum.rs
│   │   │   ├── d2s.rs
│   │   │   ├── einsum.rs
│   │   │   ├── fft.rs
│   │   │   ├── grid_sample.rs
│   │   │   ├── logic.rs
│   │   │   ├── math/
│   │   │   │   ├── clip.rs
│   │   │   │   ├── gemm.rs
│   │   │   │   ├── mat_mul_integer.rs
│   │   │   │   ├── pow.rs
│   │   │   │   └── rem.rs
│   │   │   ├── math.rs
│   │   │   ├── ml/
│   │   │   │   ├── category_mapper.rs
│   │   │   │   ├── linear_classifier.rs
│   │   │   │   ├── linear_regressor.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── normalizer.rs
│   │   │   │   └── tree_ensemble_classifier.rs
│   │   │   ├── mod.rs
│   │   │   ├── multinomial.rs
│   │   │   ├── nn/
│   │   │   │   ├── batch_norm.rs
│   │   │   │   ├── conv_transpose.rs
│   │   │   │   ├── dropout.rs
│   │   │   │   ├── instance_norm.rs
│   │   │   │   ├── layer_norm.rs
│   │   │   │   ├── lrn.rs
│   │   │   │   ├── mod.rs
│   │   │   │   └── reduce.rs
│   │   │   ├── non_max_suppression.rs
│   │   │   ├── quant.rs
│   │   │   ├── random.rs
│   │   │   ├── rec/
│   │   │   │   ├── common.rs
│   │   │   │   ├── gru.rs
│   │   │   │   ├── lstm.rs
│   │   │   │   ├── rnn.rs
│   │   │   │   └── scan.rs
│   │   │   ├── rec.rs
│   │   │   ├── resize.rs
│   │   │   └── s2d.rs
│   │   ├── pb_helpers.rs
│   │   ├── prost/
│   │   │   └── onnx.rs
│   │   └── tensor.rs
│   └── test_cases/
│       ├── byte_sb_bidi_lstm/
│       │   ├── README.md
│       │   ├── generate_io.py
│       │   ├── io.npz
│       │   └── model.onnx
│       ├── deconv_group/
│       │   ├── io.npz
│       │   ├── model.onnx
│       │   └── vars.sh
│       ├── lgbm_classifier_tensor/
│       │   ├── generate_io.py
│       │   ├── io.npz
│       │   ├── model.onnx
│       │   └── vars.sh
│       ├── lgbm_regressor_tensor/
│       │   ├── generate_io.py
│       │   ├── io.npz
│       │   ├── model.onnx
│       │   └── vars.sh
│       ├── linear_classifier/
│       │   └── model.onnx
│       ├── linear_regressor/
│       │   └── model.onnx
│       ├── qlstm_3-2-3_T3_S1/
│       │   ├── final.mdl
│       │   ├── io.npz
│       │   ├── model.onnx
│       │   ├── model.raw
│       │   └── vars.sh
│       ├── qrelu_1/
│       │   ├── final.mdl
│       │   ├── io.npz
│       │   ├── model.onnx
│       │   ├── model.raw
│       │   └── vars.sh
│       ├── qrelu_2/
│       │   ├── final.mdl
│       │   ├── io.npz
│       │   ├── model.onnx
│       │   ├── model.raw
│       │   └── vars.sh
│       ├── qsigmoid_1/
│       │   ├── final.mdl
│       │   ├── io.npz
│       │   ├── model.onnx
│       │   ├── model.raw
│       │   └── vars.sh
│       ├── qsigmoid_2/
│       │   ├── final.mdl
│       │   ├── io.npz
│       │   ├── model.onnx
│       │   ├── model.raw
│       │   └── vars.sh
│       ├── qtanh_1/
│       │   ├── final.mdl
│       │   ├── io.npz
│       │   ├── model.onnx
│       │   ├── model.raw
│       │   └── vars.sh
│       ├── qtanh_2/
│       │   ├── final.mdl
│       │   ├── io.npz
│       │   ├── model.onnx
│       │   ├── model.raw
│       │   └── vars.sh
│       ├── qtdnn_10x5_101_i32_biases/
│       │   ├── final.mdl
│       │   ├── io.npz
│       │   ├── model.onnx
│       │   ├── model.raw
│       │   └── vars.sh
│       ├── run_all.sh
│       ├── tinyyolov2/
│       │   ├── io.npz
│       │   └── vars.sh
│       ├── transformer-mlm/
│       │   ├── generate_io.py
│       │   ├── io.npz
│       │   └── vars.sh
│       ├── xgboost_classifier_tree/
│       │   ├── generate_io.py
│       │   ├── io.npz
│       │   ├── model.onnx
│       │   └── vars.sh
│       └── xgboost_regressor_tree/
│           ├── generate_io.py
│           ├── io.npz
│           ├── model.onnx
│           └── vars.sh
├── onnx-opl/
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   └── src/
│       ├── grid_sample.rs
│       ├── lib.rs
│       ├── lrn.rs
│       ├── ml/
│       │   ├── category_mapper.rs
│       │   ├── mod.rs
│       │   ├── tree.rs
│       │   └── tree_ensemble_classifier.rs
│       ├── multinomial.rs
│       ├── non_max_suppression.rs
│       ├── random.rs
│       └── resize.rs
├── post-release.sh
├── pulse/
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   └── src/
│       ├── fact.rs
│       ├── lib.rs
│       ├── macros.rs
│       ├── model.rs
│       └── ops/
│           ├── array/
│           │   ├── broadcast.rs
│           │   ├── concat.rs
│           │   ├── mask.rs
│           │   ├── mod.rs
│           │   ├── pad.rs
│           │   └── slice.rs
│           ├── cnn/
│           │   ├── conv.rs
│           │   ├── deconv.rs
│           │   ├── mod.rs
│           │   └── pools.rs
│           ├── delay.rs
│           ├── downsample.rs
│           ├── dummy.rs
│           ├── fft.rs
│           ├── identity.rs
│           ├── mask.rs
│           ├── mod.rs
│           ├── scan.rs
│           ├── slice.rs
│           └── source.rs
├── pulse-opl/
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   └── src/
│       ├── concat.rs
│       ├── deconv_delay.rs
│       ├── delay.rs
│       ├── lib.rs
│       ├── mask.rs
│       ├── pad.rs
│       └── slice.rs
├── release.sh
├── rustfmt.toml
├── tensorflow/
│   ├── Cargo.toml
│   ├── LICENSE
│   ├── LICENSE-APACHE
│   ├── LICENSE-MIT
│   ├── benches/
│   │   └── hey_snips_3.pb
│   ├── build-proto.rs
│   ├── examples/
│   │   └── plus3.rs
│   ├── protos/
│   │   └── tensorflow/
│   │       └── core/
│   │           ├── framework/
│   │           │   ├── attr_value.proto
│   │           │   ├── function.proto
│   │           │   ├── graph.proto
│   │           │   ├── node_def.proto
│   │           │   ├── op_def.proto
│   │           │   ├── resource_handle.proto
│   │           │   ├── tensor.proto
│   │           │   ├── tensor_shape.proto
│   │           │   ├── types.proto
│   │           │   ├── variable.proto
│   │           │   └── versions.proto
│   │           └── protobuf/
│   │               ├── meta_graph.proto
│   │               ├── saved_model.proto
│   │               ├── saved_object_graph.proto
│   │               ├── saver.proto
│   │               ├── struct.proto
│   │               └── trackable_object_graph.proto
│   ├── src/
│   │   ├── conform/
│   │   │   ├── mod.rs
│   │   │   └── tf.rs
│   │   ├── lib.rs
│   │   ├── model.rs
│   │   ├── ops/
│   │   │   ├── array/
│   │   │   │   ├── concatv2.rs
│   │   │   │   ├── expand_dims.rs
│   │   │   │   ├── fill.rs
│   │   │   │   ├── gather_nd.rs
│   │   │   │   ├── gather_v2.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── pack.rs
│   │   │   │   ├── pad.rs
│   │   │   │   ├── squeeze.rs
│   │   │   │   └── transpose.rs
│   │   │   ├── control_flow.rs
│   │   │   ├── logic.rs
│   │   │   ├── math/
│   │   │   │   └── reduce.rs
│   │   │   ├── math.rs
│   │   │   ├── mod.rs
│   │   │   ├── nn/
│   │   │   │   ├── conv2d.rs
│   │   │   │   ├── dw_conv2d.rs
│   │   │   │   ├── fused_batch_norm.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── pools.rs
│   │   │   │   └── s2b/
│   │   │   │       ├── mod.rs
│   │   │   │       ├── raw.rs
│   │   │   │       └── unary.rs
│   │   │   ├── quant.rs
│   │   │   ├── random/
│   │   │   │   ├── mod.rs
│   │   │   │   ├── philox.rs
│   │   │   │   └── random_uniform.rs
│   │   │   └── rec/
│   │   │       ├── block_lstm.rs
│   │   │       └── mod.rs
│   │   ├── prost/
│   │   │   ├── google.protobuf.rs
│   │   │   └── tensorflow.rs
│   │   ├── tensor.rs
│   │   └── tfpb.rs
│   └── tests/
│       ├── models/
│       │   └── plus3.pb
│       ├── ops_array_pack.rs
│       ├── ops_array_strided_slice.proptest-regressions
│       ├── ops_array_strided_slice.rs
│       ├── ops_fake_quant_with_min_max_vars.rs
│       ├── ops_nn_conv2d.proptest-regressions
│       ├── ops_nn_conv2d.rs
│       ├── ops_nn_dwconv2d.proptest-regressions
│       ├── ops_nn_dwconv2d.rs
│       ├── ops_nn_pools.proptest-regressions
│       ├── ops_nn_pools.rs
│       ├── ops_nn_space_to_batch.proptest-regressions
│       ├── ops_nn_space_to_batch.rs
│       ├── ops_random_uniform.rs
│       └── utils/
│           └── mod.rs
├── test-rt/
│   ├── infra/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── lib.rs
│   ├── suite-onnx/
│   │   ├── Cargo.toml
│   │   ├── node.txt
│   │   ├── pytorch-converted.txt
│   │   ├── pytorch-operator.txt
│   │   ├── simple.txt
│   │   └── src/
│   │       └── lib.rs
│   ├── suite-unit/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       ├── apply_rope.rs
│   │       ├── bin_einsum.rs
│   │       ├── binary.rs
│   │       ├── conv_f16.rs
│   │       ├── conv_f32.rs
│   │       ├── conv_q.rs
│   │       ├── deconv.rs
│   │       ├── downsample.rs
│   │       ├── elmwise.rs
│   │       ├── gelu_approximate.rs
│   │       ├── lib.rs
│   │       ├── matmul_q40.rs
│   │       ├── q_binary.rs
│   │       ├── q_elmwise.rs
│   │       ├── q_flavours.rs
│   │       ├── q_helpers.rs
│   │       ├── rms_norm.rs
│   │       ├── scaled_masked_softmax.rs
│   │       ├── sdpa.rs
│   │       ├── silu.rs
│   │       └── slice.rs
│   ├── test-blas/
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   ├── src/
│   │   │   └── lib.rs
│   │   └── suite.rs
│   ├── test-cuda/
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   ├── src/
│   │   │   └── lib.rs
│   │   └── suite.rs
│   ├── test-f16/
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   ├── src/
│   │   │   └── lib.rs
│   │   └── suite.rs
│   ├── test-metal/
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   ├── ggml_suite.rs
│   │   ├── src/
│   │   │   └── lib.rs
│   │   └── suite.rs
│   ├── test-nnef-cycle/
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   ├── src/
│   │   │   └── lib.rs
│   │   └── suite.rs
│   ├── test-onnx-core/
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   ├── debug-utils/
│   │   │   ├── Cargo.toml
│   │   │   ├── README.md
│   │   │   ├── save_all.py
│   │   │   └── src/
│   │   │       └── main.rs
│   │   ├── include-passing-ignored.sh
│   │   └── src/
│   │       ├── bin/
│   │       │   └── reset-test-list.rs
│   │       └── lib.rs
│   ├── test-tflite/
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   ├── src/
│   │   │   ├── lib.rs
│   │   │   └── tflite_runtime.rs
│   │   └── suite.rs
│   └── test-unit-core/
│       ├── Cargo.toml
│       ├── build.rs
│       └── src/
│           ├── lib.rs
│           └── main.rs
├── test-suite.sh
├── tflite/
│   ├── Cargo.toml
│   ├── Readme.md
│   ├── schema/
│   │   └── tflite.fbs
│   └── src/
│       ├── lib.rs
│       ├── model.rs
│       ├── ops/
│       │   ├── array.rs
│       │   ├── cnn.rs
│       │   ├── element_wise.rs
│       │   ├── math.rs
│       │   ├── mod.rs
│       │   └── nn.rs
│       ├── registry.rs
│       ├── rewriter.rs
│       ├── ser.rs
│       ├── tensors.rs
│       └── tflite_generated.rs
├── transformers/
│   ├── Cargo.toml
│   └── src/
│       ├── lib.rs
│       ├── ops/
│       │   ├── apply_rope.rs
│       │   ├── dyn_kv_cache.rs
│       │   ├── flash_sdpa.rs
│       │   ├── mod.rs
│       │   ├── scaled_masked_softmax.rs
│       │   ├── sdpa.rs
│       │   └── streamed_sdpa.rs
│       └── rewriter.rs
└── yank.sh